offlinecopy moved to wget instead of python requests
This commit is contained in:
parent 185e9d200c
commit c7ab932ac7
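In short: the requests-based OfflineArchive class in nasg.py (removed below) is replaced by shelling out to wget through the new shared.wget wrapper, with shared.find_archiveorgurl() falling back to the closest archive.org snapshot when the live page is gone. A minimal sketch of the new flow, mirroring the offlinecopies.py script added in this commit; the URL is illustrative only:

import shared

url = 'https://example.com/some-bookmarked-page'   # illustrative URL, not from the repo
# resolve to the live URL, or to the closest Wayback snapshot if the page is dead
trueurl = shared.find_archiveorgurl(url)
if trueurl:
    # mirror the page and its requisites under source/offlinecopiesdir/<dirname>
    shared.wget(trueurl, dirname=shared.slugfname(url)).archive()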
3 changed files with 386 additions and 397 deletions
596 nasg.py
@@ -210,254 +210,6 @@ class Indexer(object):
        self.writer.commit()


class OfflineArchive(object):
    # keep in mind that these are frontmattered HTML files with full HTML and embedded images
    # they can get VERY large
    def __init__(self, url, content=None, decode_email=False):
        self.url = url
        self.parsed = urllib.parse.urlparse(url)
        self.fbase = shared.slugfname(url)
        self.fname = "%s.md" % self.fbase
        self.target = os.path.join(
            shared.config.get('source', 'offlinecopiesdir'),
            self.fname
        )
        self.targetd = os.path.join(
            shared.config.get('source', 'offlinecopiesdir'),
            self.fbase
        )
        if not os.path.isdir(self.targetd):
            os.mkdir(self.targetd)

        self.fm = frontmatter.loads('')
        self.fm.metadata = {
            'url': self.url,
            'date': arrow.utcnow().format("YYYY-MM-DDTHH:mm:ssZ"),
        }
        self.headers = requests.utils.default_headers()
        self.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
        })

        self.skip_fetch = False
        if content:
            self.skip_fetch = True
            if decode_email:
                content = quopri.decodestring(content)
                content = str(content, 'utf-8', errors='replace')
            self.fm.content = content
        #self.tmp = tempfile.mkdtemp(
            #'offlinearchive_',
            #dir=tempfile.gettempdir()
        #)
        #atexit.register(
            #shutil.rmtree,
            #os.path.abspath(self.tmp)
        #)
        #self.images = []

        self.exists = os.path.isfile(self.target)

    #def read(self):
        #if not self.exists:
            #return ''

        #with open(self.target, 'rt') as f:
            #self.fm = frontmatter.loads(f.read())

        #readable = ''
        #try:
            #readable = Document(self.fm.content)
            #readable = shared.Pandoc(False).convert(readable.summary())
            #readable = shared.Pandoc().convert(readable)
        #except Exception as e:
            #logging.error('Failed to readable %s', self.target)

        #return readable


    def _getimage(self, src):
        imgname, imgext = os.path.splitext(os.path.basename(src))
        imgtarget = os.path.join(
            self.targetd,
            "%s%s" % (slugify(imgname, only_ascii=True, lower=True), imgext)
        )
        try:
            logging.debug('downloading image %s', src)
            r = requests.get(
                src,
                allow_redirects=True,
                timeout=60,
                stream=True
            )
            with open(imgtarget, 'wb') as f:
                for chunk in r.iter_content():
                    if chunk:
                        f.write(chunk)

            self.fm.content = self.fm.content.replace(
                src,
                '%s/%s' % (self.fbase, imgname)
            )
        except Exception as e:
            logging.error('pulling image %s failed: %s', src, e)
            return

    def _get_images(self):
        logging.debug("trying to save images")
        soup = BeautifulSoup(self.fm.content, 'lxml')

        embedded = re.compile(r'^data:.*')
        for img in soup.find_all('img'):
            src = img.get('src')
            if not src:
                continue
            if embedded.match(src):
                continue

            im = urllib.parse.urlparse(src)
            if not im.scheme:
                im = im._replace(scheme=self.parsed.scheme)
            if not im.netloc:
                im = im._replace(netloc=self.parsed.netloc)

            self._getimage(im.geturl())


    #def _getimage(self, src):
        #tmp = os.path.join(self.tmp, "%s" % slugify(os.path.basename(src))[:200])
        #try:
            #r = requests.get(
                #src,
                #allow_redirects=True,
                #timeout=60,
                #stream=True
            #)
            #with open(tmp, 'wb') as f:
                #for chunk in r.iter_content():
                    #if chunk:
                        #f.write(chunk)

            #logging.debug('trying to embed %s', src)
            #with open(tmp, 'rb') as imgdata:
                #data = str(base64.b64encode(imgdata.read()), 'ascii')
                #mimetype, encoding = mimetypes.guess_type(tmp)
                #self.fm.content = self.fm.content.replace(
                    #src,
                    #"data:%s;base64,%s" % (mimetype, data)
                #)
        #except Exception as e:
            #logging.error('pulling image %s failed: %s', src, e)
            #return

    #def _embed_images(self):
        #logging.debug("trying to embed images")
        #soup = BeautifulSoup(self.fm.content, 'lxml')

        #embedded = re.compile(r'^data:.*')
        #for img in soup.find_all('img'):
            #src = img.get('src')
            #if not src:
                #continue
            #if embedded.match(src):
                #continue

            #im = urllib.parse.urlparse(src)
            #if not im.scheme:
                #im = im._replace(scheme=self.parsed.scheme)
            #if not im.netloc:
                #im = im._replace(netloc=self.parsed.netloc)

            #self._getimage(im.geturl())


    def save(self):
        logging.info(
            "saving offline copy of\n\t%s to:\n\t%s",
            self.url,
            self.target
        )

        with open(self.target, 'wt') as f:
            f.write(frontmatter.dumps(self.fm))

    @property
    def archiveorgurl(self):
        logging.debug("trying archive.org for %s", self.url)
        a = self.fetch(
            "http://archive.org/wayback/available?url=%s" % self.url,
        )
        if not a:
            logging.debug("no entry for %s on archive.org", self.url)
            return None

        try:
            a = json.loads(a.text)
            aurl = a.get(
                'archived_snapshots', {}
            ).get(
                'closest', {}
            ).get(
                'url', None
            )
            logging.debug("found %s in archive.org for %s", aurl, self.url)
            self.updateurl(aurl)
            return self.fetch(aurl)
        except Exception as e:
            logging.error("archive.org parsing failed: %s", e)
            return None


    def fetch(self, url):
        try:
            r = requests.get(
                self.url,
                allow_redirects=True,
                timeout=60,
                headers=self.headers
            )
            if r.status_code == requests.codes.ok:
                return r
        except Exception as e:
            return None


    #def read():
        #if os.path.isfile(self.target):
            #with open(self.target) as f:
                #self.fm = frontmatter.loads(f.read())
                #return


    def run(self):
        if self.exists:
            logging.info("offline archive for %s already exists", self.url)
            return

        logging.info("preparing offline copy of %s", self.url)

        if not self.skip_fetch:
            r = self.fetch(self.url)

            # in case it's not, try to look for an archive.org url:
            if not r:
                logging.warning("couldn't get live version of %s, trying archive.org", self.url)
                r = self.fetch(self.archiveorgurl)

            # no live and no archive.org entry :((
            # however, by miracle, I may already have a copy, so skip if it's there already
            if not r:
                logging.error("no live or archive version of %s found :((", self.url)
                if not self.exists:
                    self.save()
                return

            self.fm.content = r.text

        self._get_images()
        self.save()


class Renderer(object):
    def __init__(self):
        self.sitevars = dict(shared.config.items('site'))
@@ -604,8 +356,8 @@ class Comment(BaseRenderable):
        self._tmplvars = {
            'published': self.published.datetime,
            'author': self.meta.get('author', {}),
            'content': self.content,
            'html': self.html,
            #'content': self.content,
            #'html': self.html,
            'source': self.source,
            'target': self.targeturl,
            'type': self.meta.get('type', 'webmention'),
@@ -795,8 +547,9 @@ class WebImage(object):
                    shared.config.get('target', 'filesdir'),
                    fname
                ),
                'url': "%s/%s/%s" % (
                    shared.config.get('site', 'url'),
                'url': "/%s/%s" % (
                #'url': "%s/%s/%s" % (
                    #shared.config.get('site', 'url'),
                    shared.config.get('source', 'files'),
                    fname
                ),
@@ -812,8 +565,9 @@ class WebImage(object):
            self.small = [e for e in self.sizes if e[1]['crop'] == False][0][1]['url']
            self.target = self.sizes[-1][1]['url']
        else:
            self.small = self.fallback = "%s/%s/%s" % (
                shared.config.get('site', 'url'),
            self.small = self.fallback = "/%s/%s" % (
            #self.small = self.fallback = "%s/%s/%s" % (
                #shared.config.get('site', 'url'),
                shared.config.get('source', 'files'),
                "%s%s" % (self.fname, self.ext)
            )
@@ -1129,12 +883,12 @@ class Taxonomy(BaseIter):
        self.taxonomy = taxonomy


    @property
    def pages(self):
        if hasattr(self, '_pages'):
            return self._pages
        self._pages = math.ceil(len(self.data) / shared.config.getint('common', 'pagination'))
        return self._pages
    #@property
    #def pages(self):
        #if hasattr(self, '_pages'):
            #return self._pages
        #self._pages = math.ceil(len(self.data) / shared.config.getint('common', 'pagination'))
        #return self._pages

    def __repr__(self):
        return "taxonomy %s with %d items" % (self.taxonomy, len(self.data))
@@ -1184,71 +938,48 @@ class Taxonomy(BaseIter):

    def __mkdirs(self):
        check = [self.basep, self.myp, self.feedp]

        if self.pages > 1:
            check.append(self.pagep)
            for i in range(2, self.pages+1):
                subpagep = os.path.abspath(os.path.join(
                    self.pagep,
                    '%d' % i
                ))
                check.append(subpagep)

        for p in check:
            if not os.path.isdir(p):
                logging.debug("creating dir %s", p)
                os.mkdir(p)


    def tpath(self, page):
        if page == 1:
            return "%s/index.html" % (self.myp)
            p = "%s" % (self.myp)
        else:
            return "%s/%d/index.html" % (self.pagep, page)
            p = os.path.join(self.pagep, "%d" % page)

        if not os.path.isdir(p):
            logging.debug("creating dir %s", p)
            os.mkdir(p)

    async def grender(self, renderer):
        #if not self.slug or self.slug is 'None':
            #return
        return os.path.join(p, "index.html")

        self.__mkdirs()
        target = self.tpath(1)
        target = target.replace('index', 'gallery')
    @property
    def is_singlepage(self):
        spcats = shared.config.get('common', 'onepagecategories').split(',')
        if self.name in spcats and 'category' == self.taxonomy:
            return True
        return False

        if not shared.config.getboolean('params', 'force') and os.path.isfile(target):
            ttime = int(os.path.getmtime(target))
            mtime = self.mtime
            if ttime == mtime:
                logging.info('taxonomy index for "%s" exists and up-to-date (lastmod: %d)', self.slug, ttime)
                return
            else:
                logging.info('taxonomy update needed: %s timestamp is %d, last post timestamp is %d (%s)',
                    target,
                    ttime,
                    mtime,
                    self.data[mtime].fname
                )
    def posttmpls(self, order='time', start=0, end=None):
        end = end or len(self.data)
        if 'all' == order:
            return [
                i.tmplvars
                for k, i in list(sorted(
                    self.data.items(),
                    key=lambda value: value[1].title.lower()
                ))
            ]

        posttmpls = [self.data[k].tmplvars for k in list(sorted(
            self.data.keys(), reverse=True))]

        logging.info("rendering gallery to %s", target)
        tmplvars = {
            'taxonomy': {
                'url': self.baseurl,
                'name': self.name,
                'slug': self.slug,
                'taxonomy': self.taxonomy,
                'lastmod': arrow.get(self.mtime).datetime
            },
            'site': renderer.sitevars,
            'posts': posttmpls,
        }

        r = renderer.j2.get_template('gallery.html').render(tmplvars)
        with open(target, "wt") as html:
            html.write(r)
        os.utime(target, (self.mtime, self.mtime))
        return [
            self.data[k].tmplvars
            for k in list(sorted(
                self.data.keys(),
                reverse=True
            ))[start:end]
        ]


    async def render(self, renderer):
@@ -1272,20 +1003,59 @@ class Taxonomy(BaseIter):
                    self.data[mtime].fname
                )

        while page <= self.pages:
            self.renderpage(renderer, page)
        if self.is_singlepage:
            pagination = len(self.data)
        else:
            pagination = shared.config.getint('common', 'pagination')
        pages = math.ceil(len(self.data) / pagination)

        while page <= pages:
            self.render_page(renderer, page, pagination, pages)
            page = page+1

        self.render_feeds(renderer)
        self.ping_websub

    def renderpage(self, renderer, page):
        pagination = int(shared.config.get('common', 'pagination'))

    def render_feeds(self, renderer):
        pagination = shared.config.getint('common', 'pagination')
        start = 0
        end = int(start + pagination)
        posttmpls = self.posttmpls('time', start, end)
        tmplvars = {
            'taxonomy': {
                'url': self.baseurl,
                'name': self.name,
                'slug': self.slug,
                'taxonomy': self.taxonomy,
                'lastmod': arrow.get(self.mtime).datetime
            },
            'site': renderer.sitevars,
            'posts': posttmpls,
        }

        target = os.path.join(self.feedp, 'index.atom')
        logging.info("rendering Atom feed to %s", target)
        r = renderer.j2.get_template('atom.html').render(tmplvars)
        with open(target, "wt") as html:
            html.write(r)
        os.utime(target, (self.mtime, self.mtime))


    def render_page(self, renderer, page, pagination, pages):
        if self.is_singlepage:
            posttmpls = self.posttmpls('all')
        else:
            start = int((page-1) * pagination)
            end = int(start + pagination)

        posttmpls = [self.data[k].tmplvars for k in list(sorted(
            self.data.keys(), reverse=True))[start:end]]
            posttmpls = self.posttmpls('time', start, end)

        target = self.tpath(page)
        tdir = os.path.dirname(target)
        if not os.path.isdir(tdir):
            logging.debug("creating dir %s", tdir)
            os.mkdir(tdir)

        logging.info("rendering taxonomy page %d to %s", page, target)
        tmplvars = {
            'taxonomy': {
@@ -1294,7 +1064,7 @@ class Taxonomy(BaseIter):
                'slug': self.slug,
                'taxonomy': self.taxonomy,
                'paged': page,
                'total': self.pages,
                'total': pages,
                'perpage': pagination,
                'lastmod': arrow.get(self.mtime).datetime
            },
@@ -1307,54 +1077,8 @@ class Taxonomy(BaseIter):
            html.write(r)
        os.utime(target, (self.mtime, self.mtime))

        if 1 == page:
            #target = os.path.join(self.feedp, 'index.rss')
            #logging.info("rendering RSS feed to %s", target)
            #r = renderer.j2.get_template('rss.html').render(tmplvars)
            #with open(target, "wt") as html:
                #html.write(r)
            #os.utime(target, (self.mtime, self.mtime))

            target = os.path.join(self.feedp, 'index.atom')
            logging.info("rendering Atom feed to %s", target)
            r = renderer.j2.get_template('atom.html').render(tmplvars)
            with open(target, "wt") as html:
                html.write(r)
            os.utime(target, (self.mtime, self.mtime))

        # ---
        # this is a joke
        # see http://indieweb.org/YAMLFeed
        # don't do YAMLFeeds.
        if 1 == page:
            fm = frontmatter.loads('')
            fm.metadata = {
                'site': {
                    'author': renderer.sitevars['author'],
                    'url': renderer.sitevars['url'],
                    'title': renderer.sitevars['title'],
                },
                'items': [],
            }

            for p in posttmpls:
                fm.metadata['items'].append({
                    'title': p['title'],
                    'url': "%s/%s/" % ( renderer.sitevars['url'], p['slug']),
                    'content': p['content'],
                    'summary': p['summary'],
                    'published': p['published'],
                    'updated': p['updated'],
                })

            target = os.path.join(self.feedp, 'index.yml')
            logging.info("rendering YAML feed to %s", target)
            with open(target, "wt") as html:
                html.write(frontmatter.dumps(fm))
            os.utime(target, (self.mtime, self.mtime))
        # ---

        if 1 == page:
    def ping_websub(self):
        if not self.taxonomy or self.taxonomy == 'category':
            t = shared.config.get('site', 'websuburl')
            data = {
@@ -1367,6 +1091,96 @@ class Taxonomy(BaseIter):
            requests.post(t, data=data)


    #def renderpage(self, renderer, page):
        #pagination = int(shared.config.get('common', 'pagination'))
        #start = int((page-1) * pagination)
        #end = int(start + pagination)

        #posttmpls = [self.data[k].tmplvars for k in list(sorted(
            #self.data.keys(), reverse=True))[start:end]]

        #target = self.tpath(page)
        #logging.info("rendering taxonomy page %d to %s", page, target)
        #tmplvars = {
            #'taxonomy': {
                #'url': self.baseurl,
                #'name': self.name,
                #'slug': self.slug,
                #'taxonomy': self.taxonomy,
                #'paged': page,
                #'total': self.pages,
                #'perpage': pagination,
                #'lastmod': arrow.get(self.mtime).datetime
            #},
            #'site': renderer.sitevars,
            #'posts': posttmpls,
        #}

        #r = renderer.j2.get_template('archive.html').render(tmplvars)
        #with open(target, "wt") as html:
            #html.write(r)
        #os.utime(target, (self.mtime, self.mtime))

        #if 1 == page:
            ##target = os.path.join(self.feedp, 'index.rss')
            ##logging.info("rendering RSS feed to %s", target)
            ##r = renderer.j2.get_template('rss.html').render(tmplvars)
            ##with open(target, "wt") as html:
                ##html.write(r)
            ##os.utime(target, (self.mtime, self.mtime))

            #target = os.path.join(self.feedp, 'index.atom')
            #logging.info("rendering Atom feed to %s", target)
            #r = renderer.j2.get_template('atom.html').render(tmplvars)
            #with open(target, "wt") as html:
                #html.write(r)
            #os.utime(target, (self.mtime, self.mtime))

        ## ---
        ## this is a joke
        ## see http://indieweb.org/YAMLFeed
        ## don't do YAMLFeeds.
        #if 1 == page:
            #fm = frontmatter.loads('')
            #fm.metadata = {
                #'site': {
                    #'author': renderer.sitevars['author'],
                    #'url': renderer.sitevars['url'],
                    #'title': renderer.sitevars['title'],
                #},
                #'items': [],
            #}

            #for p in posttmpls:
                #fm.metadata['items'].append({
                    #'title': p['title'],
                    #'url': "%s/%s/" % ( renderer.sitevars['url'], p['slug']),
                    #'content': p['content'],
                    #'summary': p['summary'],
                    #'published': p['published'],
                    #'updated': p['updated'],
                #})

            #target = os.path.join(self.feedp, 'index.yml')
            #logging.info("rendering YAML feed to %s", target)
            #with open(target, "wt") as html:
                #html.write(frontmatter.dumps(fm))
            #os.utime(target, (self.mtime, self.mtime))
        ## ---

        #if 1 == page:
            #if not self.taxonomy or self.taxonomy == 'category':
                #t = shared.config.get('site', 'websuburl')
                #data = {
                    #'hub.mode': 'publish',
                    #'hub.url': "%s%s" % (
                        #shared.config.get('site', 'url'), self.baseurl
                    #)
                #}
                #logging.info("pinging %s with data %s", t, data)
                #requests.post(t, data=data)


class Content(BaseIter):
    def __init__(self, images, comments, extensions=['md']):
        super(Content, self).__init__()
@@ -1511,9 +1325,9 @@ class Singular(BaseRenderable):
            #self.photo,
            #self.content,
        #)
        if shared.config.getboolean('params', 'nooffline'):
            return
        trigger = self.offlinecopies
        #if shared.config.getboolean('params', 'nooffline'):
            #return
        #trigger = self.offlinecopies

    #def __filter_syndication(self):
        #syndications = self.meta.get('syndicate', None)
@@ -1680,7 +1494,7 @@ class Singular(BaseRenderable):
                'text': 'CC BY 4.0',
                'description': 'Licensed under <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</a>. You are free to share or republish, even if modified, if you link back here and indicate the modifications, even for commercial use.'
            }
        if 'journal' == self.category:
        elif 'journal' == self.category:
            l = {
                'url': 'https://creativecommons.org/licenses/by-nc/4.0/',
                'text': 'CC BY-NC 4.0',
@@ -1894,26 +1708,26 @@ class Singular(BaseRenderable):
        return self._sumhtml


    @property
    def offlinecopies(self):
        # stupidly simple property caching
        if hasattr(self, 'copies'):
            return self.copies
    #@property
    #def offlinecopies(self):
        ## stupidly simple property caching
        #if hasattr(self, 'copies'):
            #return self.copies

        copies = {}
        for maybe in ['bookmark-of', 'in-reply-to', 'repost-of', 'favorite-of']:
            maybe = self.meta.get(maybe, False)
            if not maybe:
                continue
            if not isinstance(maybe, list):
                maybe = [maybe]
            for url in maybe:
                arch = OfflineArchive(url)
                arch.run()
                #copies[url] = arch.read()
        #copies = {}
        #for maybe in ['bookmark-of', 'in-reply-to', 'repost-of', 'favorite-of']:
            #maybe = self.meta.get(maybe, False)
            #if not maybe:
                #continue
            #if not isinstance(maybe, list):
                #maybe = [maybe]
            #for url in maybe:
                #arch = OfflineArchive(url)
                #arch.run()
                ##copies[url] = arch.read()

        #self.copies = copies
        #return copies
        ##self.copies = copies
        ##return copies


    @property
@@ -2157,10 +1971,6 @@ class NASG(object):
        for e in [self.content.categories, self.content.tags]:
            for name, t in e.items():
                await t.render(self.renderer)
                if name == 'photo' and t.taxonomy == 'category':
                    await t.grender(self.renderer)



    async def __afrender(self):
        await self.content.front.render(self.renderer)
39 offlinecopies.py (new file)
@@ -0,0 +1,39 @@
import glob
import shared
import os
import logging
import frontmatter

# remove the rest of the potential loggers
while len(logging.root.handlers) > 0:
    logging.root.removeHandler(logging.root.handlers[-1])

# --- set loglevel
logging.basicConfig(
    level=10,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

bookmarks = glob.glob('/web/petermolnar.net/petermolnar.net/content/bookmark/*.md')
bm = {}
for b in bookmarks:
    with open(b, 'rt') as f:
        fm = frontmatter.loads(f.read())
        if not fm.metadata.get('bookmark-of'):
            continue
        bm[b] = fm

for fname, fm in bm.items():
    logging.info('dealing with %s', fname)
    url = fm.metadata.get('bookmark-of')
    f, ext = os.path.splitext(os.path.basename(fname))
    p = os.path.join(
        shared.config.get('source', 'offlinecopiesdir'),
        f
    )
    if os.path.isdir(p):
        continue

    trueurl = shared.find_archiveorgurl(url)
    w = shared.wget(trueurl, dirname=f)
    w.archive()
140 shared.py
@@ -5,6 +5,8 @@ import glob
import logging
import subprocess
import json
import requests
from urllib.parse import urlparse, urlunparse

from whoosh import fields
from whoosh import analysis
@@ -250,3 +252,141 @@ class Pandoc(CMDLine):
                stderr
            )
        return stdout.decode('utf-8').strip()


class HeadlessChromium(CMDLine):
    def __init__(self, url):
        super().__init__('chromium-browser')
        self.url = url

    def get(self):
        cmd = (
            self.executable,
            '--headless',
            '--disable-gpu',
            '--disable-preconnect',
            '--dump-dom',
            '--timeout 60',
            '--save-page-as-mhtml',
            "%s" % self.url
        )
        logging.debug('getting URL %s with headless chrome', self.url)
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        stdout, stderr = p.communicate()
        if stderr:
            logging.error(
                "Error getting URL:\n\t%s\n\t%s",
                cmd,
                stderr
            )
        return stdout.decode('utf-8').strip()


class wget(CMDLine):
    def __init__(self, url, dirname=None):
        super().__init__('wget')
        self.url = url
        self.slug = dirname or slugfname(self.url)
        self.saveto = os.path.join(
            config.get('source', 'offlinecopiesdir'),
            self.slug
        )

    def archive(self):
        cmd = (
            self.executable,
            '-e',
            'robots=off',
            '--timeout=360',
            '--no-clobber',
            '--no-directories',
            '--adjust-extension',
            '--span-hosts',
            '--wait=1',
            '--random-wait',
            '--convert-links',
            #'--backup-converted',
            '--page-requisites',
            '--directory-prefix=%s' % self.saveto,
            "%s" % self.url
        )
        logging.debug('getting URL %s with wget', self.url)
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        stdout, stderr = p.communicate()
        if stderr:
            logging.error(
                "Error getting URL:\n\t%s\n\t%s",
                cmd,
                stderr
            )
        return stdout.decode('utf-8').strip()

def find_realurl(url):
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })

    try:
        r = requests.get(
            url,
            allow_redirects=True,
            timeout=60,
            headers=headers
        )
    except Exception as e:
        logging.error('getting real url failed: %s', e)
        return (None, 400)

    finalurl = list(urlparse(r.url))
    finalurl[4] = '&'.join(
        [x for x in finalurl[4].split('&') if not x.startswith('utm_')])
    finalurl = urlunparse(finalurl)

    return (finalurl, r.status_code)

def find_archiveorgurl(url):
    url, status = find_realurl(url)
    if status == requests.codes.ok:
        return url

    try:
        a = requests.get(
            "http://archive.org/wayback/available?url=%s" % url,
        )
    except Exception as e:
        logging.error('Failed to fetch archive.org availability for %s' % url)
        return None

    if not a:
        logging.error('empty archive.org availability for %s' % url)
        return None

    try:
        a = json.loads(a.text)
        aurl = a.get(
            'archived_snapshots', {}
        ).get(
            'closest', {}
        ).get(
            'url', None
        )
        if aurl:
            logging.debug("found %s in archive.org for %s", aurl, url)
            return aurl
    except Exception as e:
        logging.error("archive.org parsing failed: %s", e)

    return None
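For reference, a minimal sketch of how the new helpers above fit together, assuming shared.py is importable and the configured source/offlinecopiesdir exists; the URL and directory name are made up. find_realurl() follows redirects and strips utm_* query parameters, find_archiveorgurl() falls back to the closest Wayback snapshot, and wget.archive() mirrors the result to disk.

import shared

url = 'https://example.com/gone-page'        # illustrative URL, not from the repo
cleaned, status = shared.find_realurl(url)   # (final URL, HTTP status), or (None, 400) on failure
target = shared.find_archiveorgurl(url)      # live URL, Wayback snapshot URL, or None
if target:
    shared.wget(target, dirname='gone-page').archive()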