offlinecopy moved to wget instead of python requests
This commit is contained in:
parent
185e9d200c
commit
c7ab932ac7
3 changed files with 386 additions and 397 deletions
604
nasg.py
604
nasg.py
|
@ -210,254 +210,6 @@ class Indexer(object):
|
||||||
self.writer.commit()
|
self.writer.commit()
|
||||||
|
|
||||||
|
|
||||||
class OfflineArchive(object):
    """Store an offline copy of a URL as a front-mattered HTML file.

    The page goes to ``<offlinecopiesdir>/<slug>.md`` and its images are
    downloaded next to it into ``<offlinecopiesdir>/<slug>/``.  The files
    hold the full HTML of the page, so they can get VERY large.
    """

    def __init__(self, url, content=None, decode_email=False):
        """Prepare target paths, front matter and request headers.

        :param url: address of the page to archive
        :param content: page body that is already at hand; when given,
            :meth:`run` does not fetch anything
        :param decode_email: ``content`` is quoted-printable encoded
        """
        self.url = url
        self.parsed = urllib.parse.urlparse(url)
        self.fbase = shared.slugfname(url)
        self.fname = "%s.md" % self.fbase

        basedir = shared.config.get('source', 'offlinecopiesdir')
        self.target = os.path.join(basedir, self.fname)
        self.targetd = os.path.join(basedir, self.fbase)
        if not os.path.isdir(self.targetd):
            os.mkdir(self.targetd)

        self.fm = frontmatter.loads('')
        self.fm.metadata = {
            'url': self.url,
            'date': arrow.utcnow().format("YYYY-MM-DDTHH:mm:ssZ"),
        }
        self.headers = requests.utils.default_headers()
        self.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
        })

        self.skip_fetch = False
        if content:
            self.skip_fetch = True
            if decode_email:
                content = quopri.decodestring(content)
            # decoding only makes sense for bytes; str content is kept as is
            if isinstance(content, bytes):
                content = str(content, 'utf-8', errors='replace')
            self.fm.content = content

        self.exists = os.path.isfile(self.target)

    def _getimage(self, src, original=None):
        """Download one image into ``self.targetd`` and relink it.

        :param src: absolute URL of the image
        :param original: the ``src`` value as it is written in the
            document; this is the text replaced in the content
            (defaults to ``src``)
        """
        # take the name from the URL path so a query string cannot end up
        # in the file extension
        imgname, imgext = os.path.splitext(
            os.path.basename(urllib.parse.urlparse(src).path)
        )
        localname = "%s%s" % (
            slugify(imgname, only_ascii=True, lower=True),
            imgext
        )
        imgtarget = os.path.join(self.targetd, localname)
        try:
            logging.debug('donwloading image %s', src)
            r = requests.get(
                src,
                allow_redirects=True,
                timeout=60,
                stream=True
            )
            # do not store an error page under an image name
            r.raise_for_status()
            with open(imgtarget, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        except Exception as e:
            logging.error('pulling image %s failed: %s', src, e)
            return

        # point the document at the file that was actually written
        self.fm.content = self.fm.content.replace(
            original or src,
            '%s/%s' % (self.fbase, localname)
        )

    def _get_images(self, base=None):
        """Download every non-embedded ``<img>`` of the stored content.

        :param base: URL that relative image addresses are resolved
            against; defaults to ``self.url``
        """
        logging.debug("trying to save images")
        base = base or self.url
        soup = BeautifulSoup(self.fm.content, 'lxml')

        seen = set()
        for img in soup.find_all('img'):
            src = img.get('src')
            # skip missing sources, data: URIs and already handled images
            if not src or src.startswith('data:') or src in seen:
                continue
            seen.add(src)
            self._getimage(urllib.parse.urljoin(base, src), src)

    def save(self):
        """Write the front matter and content to ``self.target``."""
        logging.info(
            "savig offline copy of\n\t%s to:\n\t%s",
            self.url,
            self.target
        )
        with open(self.target, 'wt', encoding='utf-8') as f:
            f.write(frontmatter.dumps(self.fm))

    @property
    def archiveorgurl(self):
        """URL of the closest archive.org snapshot of ``self.url``.

        :returns: the snapshot URL, or None when the availability lookup
            fails or lists no snapshot
        """
        logging.debug("trying archive.org for %s", self.url)
        a = self.fetch(
            "http://archive.org/wayback/available",
            params={'url': self.url}
        )
        if a is None:
            logging.debug("no entry for %s on archive.org", self.url)
            return None

        try:
            aurl = json.loads(a.text).get(
                'archived_snapshots', {}
            ).get(
                'closest', {}
            ).get(
                'url', None
            )
        except Exception as e:
            logging.error("archive.org parsing failed: %s", e)
            return None

        if not aurl:
            logging.debug("no entry for %s on archive.org", self.url)
            return None

        logging.debug("found %s in archive.org for %s", aurl, self.url)
        return aurl

    def fetch(self, url, params=None):
        """GET ``url`` with the browser-like headers.

        :param url: address to request
        :param params: optional query parameters, URL-encoded by requests
        :returns: the response when the final status is 200, else None
        """
        try:
            r = requests.get(
                url,
                params=params,
                allow_redirects=True,
                timeout=60,
                headers=self.headers
            )
        except Exception as e:
            logging.error("fetching %s failed: %s", url, e)
            return None

        if r.status_code == requests.codes.ok:
            return r
        logging.debug("fetching %s returned status %s", url, r.status_code)
        return None

    def run(self):
        """Create the offline copy unless it already exists.

        The live URL is tried first and the closest archive.org snapshot
        second.  When neither is available an empty copy holding only
        the front matter is saved.
        """
        if self.exists:
            logging.info("offline archive for %s already exists", self.url)
            return

        logging.info("prepairing offline copy of %s", self.url)

        base = self.url
        if not self.skip_fetch:
            r = self.fetch(self.url)

            # in case it's not live, try to look for an archive.org url
            if r is None:
                logging.warning("couldn't get live version of %s, trying archive.org", self.url)
                aurl = self.archiveorgurl
                r = self.fetch(aurl) if aurl else None

            # no live and no archive.org entry :((
            if r is None:
                logging.error("no live or archive version of %s found :((", self.url)
                if not self.exists:
                    self.save()
                return

            self.fm.content = r.text
            # relative image addresses belong to the page that was served
            base = r.url

        self._get_images(base)
        self.save()
|
|
||||||
|
|
||||||
|
|
||||||
class Renderer(object):
|
class Renderer(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.sitevars = dict(shared.config.items('site'))
|
self.sitevars = dict(shared.config.items('site'))
|
||||||
|
@ -604,8 +356,8 @@ class Comment(BaseRenderable):
|
||||||
self._tmplvars = {
|
self._tmplvars = {
|
||||||
'published': self.published.datetime,
|
'published': self.published.datetime,
|
||||||
'author': self.meta.get('author', {}),
|
'author': self.meta.get('author', {}),
|
||||||
'content': self.content,
|
#'content': self.content,
|
||||||
'html': self.html,
|
#'html': self.html,
|
||||||
'source': self.source,
|
'source': self.source,
|
||||||
'target': self.targeturl,
|
'target': self.targeturl,
|
||||||
'type': self.meta.get('type', 'webmention'),
|
'type': self.meta.get('type', 'webmention'),
|
||||||
|
@ -795,8 +547,9 @@ class WebImage(object):
|
||||||
shared.config.get('target', 'filesdir'),
|
shared.config.get('target', 'filesdir'),
|
||||||
fname
|
fname
|
||||||
),
|
),
|
||||||
'url': "%s/%s/%s" % (
|
'url': "/%s/%s" % (
|
||||||
shared.config.get('site', 'url'),
|
#'url': "%s/%s/%s" % (
|
||||||
|
#shared.config.get('site', 'url'),
|
||||||
shared.config.get('source', 'files'),
|
shared.config.get('source', 'files'),
|
||||||
fname
|
fname
|
||||||
),
|
),
|
||||||
|
@ -812,8 +565,9 @@ class WebImage(object):
|
||||||
self.small = [e for e in self.sizes if e[1]['crop'] == False][0][1]['url']
|
self.small = [e for e in self.sizes if e[1]['crop'] == False][0][1]['url']
|
||||||
self.target = self.sizes[-1][1]['url']
|
self.target = self.sizes[-1][1]['url']
|
||||||
else:
|
else:
|
||||||
self.small = self.fallback = "%s/%s/%s" % (
|
self.small = self.fallback = "/%s/%s" % (
|
||||||
shared.config.get('site', 'url'),
|
#self.small = self.fallback = "%s/%s/%s" % (
|
||||||
|
#shared.config.get('site', 'url'),
|
||||||
shared.config.get('source', 'files'),
|
shared.config.get('source', 'files'),
|
||||||
"%s%s" % (self.fname, self.ext)
|
"%s%s" % (self.fname, self.ext)
|
||||||
)
|
)
|
||||||
|
@ -1129,12 +883,12 @@ class Taxonomy(BaseIter):
|
||||||
self.taxonomy = taxonomy
|
self.taxonomy = taxonomy
|
||||||
|
|
||||||
|
|
||||||
@property
|
#@property
|
||||||
def pages(self):
|
#def pages(self):
|
||||||
if hasattr(self, '_pages'):
|
#if hasattr(self, '_pages'):
|
||||||
return self._pages
|
#return self._pages
|
||||||
self._pages = math.ceil(len(self.data) / shared.config.getint('common', 'pagination'))
|
#self._pages = math.ceil(len(self.data) / shared.config.getint('common', 'pagination'))
|
||||||
return self._pages
|
#return self._pages
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "taxonomy %s with %d items" % (self.taxonomy, len(self.data))
|
return "taxonomy %s with %d items" % (self.taxonomy, len(self.data))
|
||||||
|
@ -1184,71 +938,48 @@ class Taxonomy(BaseIter):
|
||||||
|
|
||||||
def __mkdirs(self):
|
def __mkdirs(self):
|
||||||
check = [self.basep, self.myp, self.feedp]
|
check = [self.basep, self.myp, self.feedp]
|
||||||
|
|
||||||
if self.pages > 1:
|
|
||||||
check.append(self.pagep)
|
|
||||||
for i in range(2, self.pages+1):
|
|
||||||
subpagep = os.path.abspath(os.path.join(
|
|
||||||
self.pagep,
|
|
||||||
'%d' % i
|
|
||||||
))
|
|
||||||
check.append(subpagep)
|
|
||||||
|
|
||||||
for p in check:
|
for p in check:
|
||||||
if not os.path.isdir(p):
|
if not os.path.isdir(p):
|
||||||
logging.debug("creating dir %s", p)
|
logging.debug("creating dir %s", p)
|
||||||
os.mkdir(p)
|
os.mkdir(p)
|
||||||
|
|
||||||
|
|
||||||
def tpath(self, page):
|
def tpath(self, page):
|
||||||
if page == 1:
|
if page == 1:
|
||||||
return "%s/index.html" % (self.myp)
|
p = "%s" % (self.myp)
|
||||||
else:
|
else:
|
||||||
return "%s/%d/index.html" % (self.pagep, page)
|
p = os.path.join(self.pagep, "%d" % page)
|
||||||
|
|
||||||
|
if not os.path.isdir(p):
|
||||||
|
logging.debug("creating dir %s", p)
|
||||||
|
os.mkdir(p)
|
||||||
|
|
||||||
async def grender(self, renderer):
|
return os.path.join(p, "index.html")
|
||||||
#if not self.slug or self.slug is 'None':
|
|
||||||
#return
|
|
||||||
|
|
||||||
self.__mkdirs()
|
@property
|
||||||
target = self.tpath(1)
|
def is_singlepage(self):
|
||||||
target = target.replace('index', 'gallery')
|
spcats = shared.config.get('common', 'onepagecategories').split(',')
|
||||||
|
if self.name in spcats and 'category' == self.taxonomy:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
if not shared.config.getboolean('params', 'force') and os.path.isfile(target):
|
def posttmpls(self, order='time', start=0, end=None):
|
||||||
ttime = int(os.path.getmtime(target))
|
end = end or len(self.data)
|
||||||
mtime = self.mtime
|
if 'all' == order:
|
||||||
if ttime == mtime:
|
return [
|
||||||
logging.info('taxonomy index for "%s" exists and up-to-date (lastmod: %d)', self.slug, ttime)
|
i.tmplvars
|
||||||
return
|
for k, i in list(sorted(
|
||||||
else:
|
self.data.items(),
|
||||||
logging.info('taxonomy update needed: %s timestamp is %d, last post timestamp is %d (%s)',
|
key=lambda value: value[1].title.lower()
|
||||||
target,
|
))
|
||||||
ttime,
|
]
|
||||||
mtime,
|
|
||||||
self.data[mtime].fname
|
|
||||||
)
|
|
||||||
|
|
||||||
posttmpls = [self.data[k].tmplvars for k in list(sorted(
|
return [
|
||||||
self.data.keys(), reverse=True))]
|
self.data[k].tmplvars
|
||||||
|
for k in list(sorted(
|
||||||
logging.info("rendering gallery to %s", target)
|
self.data.keys(),
|
||||||
tmplvars = {
|
reverse=True
|
||||||
'taxonomy': {
|
))[start:end]
|
||||||
'url': self.baseurl,
|
]
|
||||||
'name': self.name,
|
|
||||||
'slug': self.slug,
|
|
||||||
'taxonomy': self.taxonomy,
|
|
||||||
'lastmod': arrow.get(self.mtime).datetime
|
|
||||||
},
|
|
||||||
'site': renderer.sitevars,
|
|
||||||
'posts': posttmpls,
|
|
||||||
}
|
|
||||||
|
|
||||||
r = renderer.j2.get_template('gallery.html').render(tmplvars)
|
|
||||||
with open(target, "wt") as html:
|
|
||||||
html.write(r)
|
|
||||||
os.utime(target, (self.mtime, self.mtime))
|
|
||||||
|
|
||||||
|
|
||||||
async def render(self, renderer):
|
async def render(self, renderer):
|
||||||
|
@ -1272,20 +1003,59 @@ class Taxonomy(BaseIter):
|
||||||
self.data[mtime].fname
|
self.data[mtime].fname
|
||||||
)
|
)
|
||||||
|
|
||||||
while page <= self.pages:
|
if self.is_singlepage:
|
||||||
self.renderpage(renderer, page)
|
pagination = len(self.data)
|
||||||
|
else:
|
||||||
|
pagination = shared.config.getint('common', 'pagination')
|
||||||
|
pages = math.ceil(len(self.data) / pagination)
|
||||||
|
|
||||||
|
while page <= pages:
|
||||||
|
self.render_page(renderer, page, pagination, pages)
|
||||||
page = page+1
|
page = page+1
|
||||||
|
|
||||||
|
self.render_feeds(renderer)
|
||||||
|
self.ping_websub
|
||||||
|
|
||||||
def renderpage(self, renderer, page):
|
|
||||||
pagination = int(shared.config.get('common', 'pagination'))
|
def render_feeds(self, renderer):
|
||||||
start = int((page-1) * pagination)
|
pagination = shared.config.getint('common', 'pagination')
|
||||||
|
start = 0
|
||||||
end = int(start + pagination)
|
end = int(start + pagination)
|
||||||
|
posttmpls = self.posttmpls('time', start, end)
|
||||||
|
tmplvars = {
|
||||||
|
'taxonomy': {
|
||||||
|
'url': self.baseurl,
|
||||||
|
'name': self.name,
|
||||||
|
'slug': self.slug,
|
||||||
|
'taxonomy': self.taxonomy,
|
||||||
|
'lastmod': arrow.get(self.mtime).datetime
|
||||||
|
},
|
||||||
|
'site': renderer.sitevars,
|
||||||
|
'posts': posttmpls,
|
||||||
|
}
|
||||||
|
|
||||||
posttmpls = [self.data[k].tmplvars for k in list(sorted(
|
target = os.path.join(self.feedp, 'index.atom')
|
||||||
self.data.keys(), reverse=True))[start:end]]
|
logging.info("rendering Atom feed to %s", target)
|
||||||
|
r = renderer.j2.get_template('atom.html').render(tmplvars)
|
||||||
|
with open(target, "wt") as html:
|
||||||
|
html.write(r)
|
||||||
|
os.utime(target, (self.mtime, self.mtime))
|
||||||
|
|
||||||
|
|
||||||
|
def render_page(self, renderer, page, pagination, pages):
|
||||||
|
if self.is_singlepage:
|
||||||
|
posttmpls = self.posttmpls('all')
|
||||||
|
else:
|
||||||
|
start = int((page-1) * pagination)
|
||||||
|
end = int(start + pagination)
|
||||||
|
posttmpls = self.posttmpls('time', start, end)
|
||||||
|
|
||||||
target = self.tpath(page)
|
target = self.tpath(page)
|
||||||
|
tdir = os.path.dirname(target)
|
||||||
|
if not os.path.isdir(tdir):
|
||||||
|
logging.debug("creating dir %s", tdir)
|
||||||
|
os.mkdir(tdir)
|
||||||
|
|
||||||
logging.info("rendering taxonomy page %d to %s", page, target)
|
logging.info("rendering taxonomy page %d to %s", page, target)
|
||||||
tmplvars = {
|
tmplvars = {
|
||||||
'taxonomy': {
|
'taxonomy': {
|
||||||
|
@ -1294,7 +1064,7 @@ class Taxonomy(BaseIter):
|
||||||
'slug': self.slug,
|
'slug': self.slug,
|
||||||
'taxonomy': self.taxonomy,
|
'taxonomy': self.taxonomy,
|
||||||
'paged': page,
|
'paged': page,
|
||||||
'total': self.pages,
|
'total': pages,
|
||||||
'perpage': pagination,
|
'perpage': pagination,
|
||||||
'lastmod': arrow.get(self.mtime).datetime
|
'lastmod': arrow.get(self.mtime).datetime
|
||||||
},
|
},
|
||||||
|
@ -1307,64 +1077,108 @@ class Taxonomy(BaseIter):
|
||||||
html.write(r)
|
html.write(r)
|
||||||
os.utime(target, (self.mtime, self.mtime))
|
os.utime(target, (self.mtime, self.mtime))
|
||||||
|
|
||||||
if 1 == page:
|
|
||||||
#target = os.path.join(self.feedp, 'index.rss')
|
def ping_websub(self):
|
||||||
#logging.info("rendering RSS feed to %s", target)
|
if not self.taxonomy or self.taxonomy == 'category':
|
||||||
#r = renderer.j2.get_template('rss.html').render(tmplvars)
|
t = shared.config.get('site', 'websuburl')
|
||||||
|
data = {
|
||||||
|
'hub.mode': 'publish',
|
||||||
|
'hub.url': "%s%s" % (
|
||||||
|
shared.config.get('site', 'url'), self.baseurl
|
||||||
|
)
|
||||||
|
}
|
||||||
|
logging.info("pinging %s with data %s", t, data)
|
||||||
|
requests.post(t, data=data)
|
||||||
|
|
||||||
|
|
||||||
|
#def renderpage(self, renderer, page):
|
||||||
|
#pagination = int(shared.config.get('common', 'pagination'))
|
||||||
|
#start = int((page-1) * pagination)
|
||||||
|
#end = int(start + pagination)
|
||||||
|
|
||||||
|
#posttmpls = [self.data[k].tmplvars for k in list(sorted(
|
||||||
|
#self.data.keys(), reverse=True))[start:end]]
|
||||||
|
|
||||||
|
#target = self.tpath(page)
|
||||||
|
#logging.info("rendering taxonomy page %d to %s", page, target)
|
||||||
|
#tmplvars = {
|
||||||
|
#'taxonomy': {
|
||||||
|
#'url': self.baseurl,
|
||||||
|
#'name': self.name,
|
||||||
|
#'slug': self.slug,
|
||||||
|
#'taxonomy': self.taxonomy,
|
||||||
|
#'paged': page,
|
||||||
|
#'total': self.pages,
|
||||||
|
#'perpage': pagination,
|
||||||
|
#'lastmod': arrow.get(self.mtime).datetime
|
||||||
|
#},
|
||||||
|
#'site': renderer.sitevars,
|
||||||
|
#'posts': posttmpls,
|
||||||
|
#}
|
||||||
|
|
||||||
|
#r = renderer.j2.get_template('archive.html').render(tmplvars)
|
||||||
|
#with open(target, "wt") as html:
|
||||||
|
#html.write(r)
|
||||||
|
#os.utime(target, (self.mtime, self.mtime))
|
||||||
|
|
||||||
|
#if 1 == page:
|
||||||
|
##target = os.path.join(self.feedp, 'index.rss')
|
||||||
|
##logging.info("rendering RSS feed to %s", target)
|
||||||
|
##r = renderer.j2.get_template('rss.html').render(tmplvars)
|
||||||
|
##with open(target, "wt") as html:
|
||||||
|
##html.write(r)
|
||||||
|
##os.utime(target, (self.mtime, self.mtime))
|
||||||
|
|
||||||
|
#target = os.path.join(self.feedp, 'index.atom')
|
||||||
|
#logging.info("rendering Atom feed to %s", target)
|
||||||
|
#r = renderer.j2.get_template('atom.html').render(tmplvars)
|
||||||
#with open(target, "wt") as html:
|
#with open(target, "wt") as html:
|
||||||
#html.write(r)
|
#html.write(r)
|
||||||
#os.utime(target, (self.mtime, self.mtime))
|
#os.utime(target, (self.mtime, self.mtime))
|
||||||
|
|
||||||
target = os.path.join(self.feedp, 'index.atom')
|
## ---
|
||||||
logging.info("rendering Atom feed to %s", target)
|
## this is a joke
|
||||||
r = renderer.j2.get_template('atom.html').render(tmplvars)
|
## see http://indieweb.org/YAMLFeed
|
||||||
with open(target, "wt") as html:
|
## don't do YAMLFeeds.
|
||||||
html.write(r)
|
#if 1 == page:
|
||||||
os.utime(target, (self.mtime, self.mtime))
|
#fm = frontmatter.loads('')
|
||||||
|
#fm.metadata = {
|
||||||
|
#'site': {
|
||||||
|
#'author': renderer.sitevars['author'],
|
||||||
|
#'url': renderer.sitevars['url'],
|
||||||
|
#'title': renderer.sitevars['title'],
|
||||||
|
#},
|
||||||
|
#'items': [],
|
||||||
|
#}
|
||||||
|
|
||||||
# ---
|
#for p in posttmpls:
|
||||||
# this is a joke
|
#fm.metadata['items'].append({
|
||||||
# see http://indieweb.org/YAMLFeed
|
#'title': p['title'],
|
||||||
# don't do YAMLFeeds.
|
#'url': "%s/%s/" % ( renderer.sitevars['url'], p['slug']),
|
||||||
if 1 == page:
|
#'content': p['content'],
|
||||||
fm = frontmatter.loads('')
|
#'summary': p['summary'],
|
||||||
fm.metadata = {
|
#'published': p['published'],
|
||||||
'site': {
|
#'updated': p['updated'],
|
||||||
'author': renderer.sitevars['author'],
|
#})
|
||||||
'url': renderer.sitevars['url'],
|
|
||||||
'title': renderer.sitevars['title'],
|
|
||||||
},
|
|
||||||
'items': [],
|
|
||||||
}
|
|
||||||
|
|
||||||
for p in posttmpls:
|
#target = os.path.join(self.feedp, 'index.yml')
|
||||||
fm.metadata['items'].append({
|
#logging.info("rendering YAML feed to %s", target)
|
||||||
'title': p['title'],
|
#with open(target, "wt") as html:
|
||||||
'url': "%s/%s/" % ( renderer.sitevars['url'], p['slug']),
|
#html.write(frontmatter.dumps(fm))
|
||||||
'content': p['content'],
|
#os.utime(target, (self.mtime, self.mtime))
|
||||||
'summary': p['summary'],
|
## ---
|
||||||
'published': p['published'],
|
|
||||||
'updated': p['updated'],
|
|
||||||
})
|
|
||||||
|
|
||||||
target = os.path.join(self.feedp, 'index.yml')
|
#if 1 == page:
|
||||||
logging.info("rendering YAML feed to %s", target)
|
#if not self.taxonomy or self.taxonomy == 'category':
|
||||||
with open(target, "wt") as html:
|
#t = shared.config.get('site', 'websuburl')
|
||||||
html.write(frontmatter.dumps(fm))
|
#data = {
|
||||||
os.utime(target, (self.mtime, self.mtime))
|
#'hub.mode': 'publish',
|
||||||
# ---
|
#'hub.url': "%s%s" % (
|
||||||
|
#shared.config.get('site', 'url'), self.baseurl
|
||||||
if 1 == page:
|
#)
|
||||||
if not self.taxonomy or self.taxonomy == 'category':
|
#}
|
||||||
t = shared.config.get('site', 'websuburl')
|
#logging.info("pinging %s with data %s", t, data)
|
||||||
data = {
|
#requests.post(t, data=data)
|
||||||
'hub.mode': 'publish',
|
|
||||||
'hub.url': "%s%s" % (
|
|
||||||
shared.config.get('site', 'url'), self.baseurl
|
|
||||||
)
|
|
||||||
}
|
|
||||||
logging.info("pinging %s with data %s", t, data)
|
|
||||||
requests.post(t, data=data)
|
|
||||||
|
|
||||||
|
|
||||||
class Content(BaseIter):
|
class Content(BaseIter):
|
||||||
|
@ -1511,9 +1325,9 @@ class Singular(BaseRenderable):
|
||||||
#self.photo,
|
#self.photo,
|
||||||
#self.content,
|
#self.content,
|
||||||
#)
|
#)
|
||||||
if shared.config.getboolean('params', 'nooffline'):
|
#if shared.config.getboolean('params', 'nooffline'):
|
||||||
return
|
#return
|
||||||
trigger = self.offlinecopies
|
#trigger = self.offlinecopies
|
||||||
|
|
||||||
#def __filter_syndication(self):
|
#def __filter_syndication(self):
|
||||||
#syndications = self.meta.get('syndicate', None)
|
#syndications = self.meta.get('syndicate', None)
|
||||||
|
@ -1680,7 +1494,7 @@ class Singular(BaseRenderable):
|
||||||
'text': 'CC BY 4.0',
|
'text': 'CC BY 4.0',
|
||||||
'description': 'Licensed under <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</a>. You are free to share or republish, even if modified, if you link back here and indicate the modifications, even for commercial use.'
|
'description': 'Licensed under <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</a>. You are free to share or republish, even if modified, if you link back here and indicate the modifications, even for commercial use.'
|
||||||
}
|
}
|
||||||
if 'journal' == self.category:
|
elif 'journal' == self.category:
|
||||||
l = {
|
l = {
|
||||||
'url': 'https://creativecommons.org/licenses/by-nc/4.0/',
|
'url': 'https://creativecommons.org/licenses/by-nc/4.0/',
|
||||||
'text': 'CC BY-NC 4.0',
|
'text': 'CC BY-NC 4.0',
|
||||||
|
@ -1894,26 +1708,26 @@ class Singular(BaseRenderable):
|
||||||
return self._sumhtml
|
return self._sumhtml
|
||||||
|
|
||||||
|
|
||||||
@property
|
#@property
|
||||||
def offlinecopies(self):
|
#def offlinecopies(self):
|
||||||
# stupidly simple property caching
|
## stupidly simple property caching
|
||||||
if hasattr(self, 'copies'):
|
#if hasattr(self, 'copies'):
|
||||||
return self.copies
|
#return self.copies
|
||||||
|
|
||||||
copies = {}
|
#copies = {}
|
||||||
for maybe in ['bookmark-of', 'in-reply-to', 'repost-of', 'favorite-of']:
|
#for maybe in ['bookmark-of', 'in-reply-to', 'repost-of', 'favorite-of']:
|
||||||
maybe = self.meta.get(maybe, False)
|
#maybe = self.meta.get(maybe, False)
|
||||||
if not maybe:
|
#if not maybe:
|
||||||
continue
|
#continue
|
||||||
if not isinstance(maybe, list):
|
#if not isinstance(maybe, list):
|
||||||
maybe = [maybe]
|
#maybe = [maybe]
|
||||||
for url in maybe:
|
#for url in maybe:
|
||||||
arch = OfflineArchive(url)
|
#arch = OfflineArchive(url)
|
||||||
arch.run()
|
#arch.run()
|
||||||
#copies[url] = arch.read()
|
##copies[url] = arch.read()
|
||||||
|
|
||||||
#self.copies = copies
|
##self.copies = copies
|
||||||
#return copies
|
##return copies
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -2157,10 +1971,6 @@ class NASG(object):
|
||||||
for e in [self.content.categories, self.content.tags]:
|
for e in [self.content.categories, self.content.tags]:
|
||||||
for name, t in e.items():
|
for name, t in e.items():
|
||||||
await t.render(self.renderer)
|
await t.render(self.renderer)
|
||||||
if name == 'photo' and t.taxonomy == 'category':
|
|
||||||
await t.grender(self.renderer)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async def __afrender(self):
|
async def __afrender(self):
|
||||||
await self.content.front.render(self.renderer)
|
await self.content.front.render(self.renderer)
|
||||||
|
|
39
offlinecopies.py
Normal file
39
offlinecopies.py
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
import glob
|
||||||
|
import shared
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import frontmatter
|
||||||
|
|
||||||
|
# remove the rest of the potential loggers
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)

# --- set loglevel
logging.basicConfig(
    level=10,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# collect every bookmark entry that names the URL it bookmarks
bookmarks = glob.glob('/web/petermolnar.net/petermolnar.net/content/bookmark/*.md')
bm = {}
for b in bookmarks:
    with open(b, 'rt') as f:
        fm = frontmatter.loads(f.read())
    if not fm.metadata.get('bookmark-of'):
        continue
    bm[b] = fm

for fname, fm in bm.items():
    logging.info('dealing with %s', fname)
    url = fm.metadata.get('bookmark-of')
    # the front matter value may be a list of URLs; archive the first one
    if isinstance(url, (list, tuple)):
        url = url[0]

    f, ext = os.path.splitext(os.path.basename(fname))
    p = os.path.join(
        shared.config.get('source', 'offlinecopiesdir'),
        f
    )
    # an existing directory means this bookmark was archived earlier
    if os.path.isdir(p):
        continue

    trueurl = shared.find_archiveorgurl(url)
    if not trueurl:
        # neither a live page nor an archive.org snapshot: nothing to fetch
        logging.warning('no live or archived copy of %s, skipping', url)
        continue

    w = shared.wget(trueurl, dirname=f)
    w.archive()
|
140
shared.py
140
shared.py
|
@ -5,6 +5,8 @@ import glob
|
||||||
import logging
|
import logging
|
||||||
import subprocess
|
import subprocess
|
||||||
import json
|
import json
|
||||||
|
import requests
|
||||||
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
|
||||||
from whoosh import fields
|
from whoosh import fields
|
||||||
from whoosh import analysis
|
from whoosh import analysis
|
||||||
|
@ -250,3 +252,141 @@ class Pandoc(CMDLine):
|
||||||
stderr
|
stderr
|
||||||
)
|
)
|
||||||
return stdout.decode('utf-8').strip()
|
return stdout.decode('utf-8').strip()
|
||||||
|
|
||||||
|
|
||||||
|
class HeadlessChromium(CMDLine):
    """Run ``chromium-browser`` headless against a URL and capture its output."""

    def __init__(self, url):
        super().__init__('chromium-browser')
        self.url = url

    def get(self):
        """Run headless Chromium on ``self.url``.

        Anything the process writes to stderr is logged as an error.

        :returns: the process' stdout, decoded as UTF-8 and stripped
        """
        # NOTE(review): '--timeout 60' is handed over as ONE argv element,
        # flag and value are not split - confirm Chromium parses it
        switches = (
            '--headless',
            '--disable-gpu',
            '--disable-preconnect',
            '--dump-dom',
            '--timeout 60',
            '--save-page-as-mhtml',
        )
        cmd = (self.executable,) + switches + ("%s" % self.url,)

        logging.debug('getting URL %s with headless chrome', self.url)
        proc = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = proc.communicate()

        if err:
            logging.error("Error getting URL:\n\t%s\n\t%s", cmd, err)

        return out.decode('utf-8').strip()
|
||||||
|
|
||||||
|
|
||||||
|
class wget(CMDLine):
    """Save a page and its requisites into the offline copies dir with wget."""

    def __init__(self, url, dirname=None):
        """
        :param url: address of the page to archive
        :param dirname: directory name under ``offlinecopiesdir``;
            defaults to the slug made from ``url``
        """
        super().__init__('wget')
        self.url = url
        self.slug = dirname or slugfname(self.url)
        self.saveto = os.path.join(
            config.get('source', 'offlinecopiesdir'),
            self.slug
        )

    def archive(self):
        """Download ``self.url`` with its page requisites into ``self.saveto``.

        :returns: wget's stdout, decoded as UTF-8 and stripped
        """
        cmd = (
            self.executable,
            '-e',
            'robots=off',
            '--timeout=360',
            '--no-clobber',
            '--no-directories',
            '--adjust-extension',
            '--span-hosts',
            '--wait=1',
            '--random-wait',
            '--convert-links',
            '--page-requisites',
            '--directory-prefix=%s' % self.saveto,
            "%s" % self.url
        )
        logging.debug('getting URL %s with wget', self.url)
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        stdout, stderr = p.communicate()
        messages = stderr.decode('utf-8', errors='replace').strip()
        # wget reports its regular progress on stderr as well, so only the
        # exit status can tell a failed run from a successful one
        if p.returncode != 0:
            logging.error(
                "Error getting URL:\n\t%s\n\t%s",
                cmd,
                messages
            )
        elif messages:
            logging.debug("wget output for %s:\n%s", self.url, messages)
        return stdout.decode('utf-8').strip()
|
||||||
|
|
||||||
|
def find_realurl(url):
    """Follow the redirects of ``url`` and drop ``utm_*`` query parameters.

    :param url: address to resolve
    :returns: ``(final url, HTTP status code)``; ``(None, 400)`` when the
        request itself raises
    """
    browser_headers = requests.utils.default_headers()
    browser_headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })

    try:
        response = requests.get(
            url,
            allow_redirects=True,
            timeout=60,
            headers=browser_headers
        )
    except Exception as e:
        logging.error('getting real url failed: %s', e)
        return (None, 400)

    # rebuild the address reached after redirects without tracking params
    landed = urlparse(response.url)
    kept = []
    for pair in landed.query.split('&'):
        if pair.startswith('utm_'):
            continue
        kept.append(pair)
    cleaned = landed._replace(query='&'.join(kept)).geturl()

    return (cleaned, response.status_code)
|
||||||
|
|
||||||
|
def find_archiveorgurl(url):
    """Return a fetchable address for ``url``: live if possible, else archived.

    :param url: address to look up
    :returns: the resolved live URL when it answers with status 200,
        otherwise the URL of the closest archive.org snapshot, or None
        when there is no snapshot or the lookup fails
    """
    realurl, status = find_realurl(url)
    if status == requests.codes.ok:
        return realurl

    # find_realurl gives no URL back when the request itself failed;
    # ask archive.org about the address the caller passed in that case
    lookup = realurl or url

    try:
        a = requests.get(
            "http://archive.org/wayback/available",
            # let requests URL-encode the looked up address
            params={'url': lookup},
            timeout=60
        )
    except Exception as e:
        logging.error('Failed to fetch archive.org availability for %s', lookup)
        return None

    # a response is falsy when its status signals an error
    if not a:
        logging.error('empty archive.org availability for %s', lookup)
        return None

    try:
        a = json.loads(a.text)
        aurl = a.get(
            'archived_snapshots', {}
        ).get(
            'closest', {}
        ).get(
            'url', None
        )
        if aurl:
            logging.debug("found %s in archive.org for %s", aurl, lookup)
            return aurl
    except Exception as e:
        logging.error("archive.org parsing failed: %s", e)

    return None
|
||||||
|
|
Loading…
Reference in a new issue