offlinecopy moved to wget instead of python requests

This commit is contained in:
Peter Molnar 2017-09-04 12:53:59 +00:00
parent 185e9d200c
commit c7ab932ac7
3 changed files with 386 additions and 397 deletions
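The gist of the change: offline copies of bookmarked pages are now produced by shelling out to wget (via the new shared.wget class and the shared.find_archiveorgurl helper added below) instead of fetching pages with python requests and saving them as frontmattered Markdown. A rough sketch of the new call path, using only names that appear in this diff; the URL and the `or url` fallback are illustrative, not part of the commit:

    import shared

    url = 'http://example.com/bookmarked-page'        # illustrative
    target = shared.find_archiveorgurl(url) or url    # live URL if reachable, else closest archive.org snapshot
    shared.wget(target, dirname=shared.slugfname(url)).archive()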

nasg.py

@@ -210,254 +210,6 @@ class Indexer(object):
        self.writer.commit()
class OfflineArchive(object):
    # keep in mind that these are frontmattered HTML files with full HTML and embedded images
    # they can get VERY large
    def __init__(self, url, content=None, decode_email=False):
        self.url = url
        self.parsed = urllib.parse.urlparse(url)
        self.fbase = shared.slugfname(url)
        self.fname = "%s.md" % self.fbase
        self.target = os.path.join(
            shared.config.get('source', 'offlinecopiesdir'),
            self.fname
        )
        self.targetd = os.path.join(
            shared.config.get('source', 'offlinecopiesdir'),
            self.fbase
        )
        if not os.path.isdir(self.targetd):
            os.mkdir(self.targetd)

        self.fm = frontmatter.loads('')
        self.fm.metadata = {
            'url': self.url,
            'date': arrow.utcnow().format("YYYY-MM-DDTHH:mm:ssZ"),
        }

        self.headers = requests.utils.default_headers()
        self.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
        })

        self.skip_fetch = False
        if content:
            self.skip_fetch = True
            if decode_email:
                content = quopri.decodestring(content)
                content = str(content, 'utf-8', errors='replace')
            self.fm.content = content

        #self.tmp = tempfile.mkdtemp(
            #'offlinearchive_',
            #dir=tempfile.gettempdir()
        #)
        #atexit.register(
            #shutil.rmtree,
            #os.path.abspath(self.tmp)
        #)
        #self.images = []

        self.exists = os.path.isfile(self.target)

    #def read(self):
        #if not self.exists:
            #return ''
        #with open(self.target, 'rt') as f:
            #self.fm = frontmatter.loads(f.read())
        #readable = ''
        #try:
            #readable = Document(self.fm.content)
            #readable = shared.Pandoc(False).convert(readable.summary())
            #readable = shared.Pandoc().convert(readable)
        #except Exception as e:
            #logging.error('Failed to readable %s', self.target)
        #return readable

    def _getimage(self, src):
        imgname, imgext = os.path.splitext(os.path.basename(src))
        imgtarget = os.path.join(
            self.targetd,
            "%s%s" % (slugify(imgname, only_ascii=True, lower=True), imgext)
        )
        try:
            logging.debug('downloading image %s', src)
            r = requests.get(
                src,
                allow_redirects=True,
                timeout=60,
                stream=True
            )
            with open(imgtarget, 'wb') as f:
                for chunk in r.iter_content():
                    if chunk:
                        f.write(chunk)

            self.fm.content = self.fm.content.replace(
                src,
                '%s/%s' % (self.fbase, imgname)
            )
        except Exception as e:
            logging.error('pulling image %s failed: %s', src, e)
            return

    def _get_images(self):
        logging.debug("trying to save images")
        soup = BeautifulSoup(self.fm.content, 'lxml')

        embedded = re.compile(r'^data:.*')
        for img in soup.find_all('img'):
            src = img.get('src')
            if not src:
                continue
            if embedded.match(src):
                continue

            im = urllib.parse.urlparse(src)
            if not im.scheme:
                im = im._replace(scheme=self.parsed.scheme)
            if not im.netloc:
                im = im._replace(netloc=self.parsed.netloc)

            self._getimage(im.geturl())

    #def _getimage(self, src):
        #tmp = os.path.join(self.tmp, "%s" % slugify(os.path.basename(src))[:200])
        #try:
            #r = requests.get(
                #src,
                #allow_redirects=True,
                #timeout=60,
                #stream=True
            #)
            #with open(tmp, 'wb') as f:
                #for chunk in r.iter_content():
                    #if chunk:
                        #f.write(chunk)

            #logging.debug('trying to embed %s', src)
            #with open(tmp, 'rb') as imgdata:
                #data = str(base64.b64encode(imgdata.read()), 'ascii')
                #mimetype, encoding = mimetypes.guess_type(tmp)
                #self.fm.content = self.fm.content.replace(
                    #src,
                    #"data:%s;base64,%s" % (mimetype, data)
                #)
        #except Exception as e:
            #logging.error('pulling image %s failed: %s', src, e)
            #return

    #def _embed_images(self):
        #logging.debug("trying to embed images")
        #soup = BeautifulSoup(self.fm.content, 'lxml')

        #embedded = re.compile(r'^data:.*')
        #for img in soup.find_all('img'):
            #src = img.get('src')
            #if not src:
                #continue
            #if embedded.match(src):
                #continue

            #im = urllib.parse.urlparse(src)
            #if not im.scheme:
                #im = im._replace(scheme=self.parsed.scheme)
            #if not im.netloc:
                #im = im._replace(netloc=self.parsed.netloc)

            #self._getimage(im.geturl())

    def save(self):
        logging.info(
            "saving offline copy of\n\t%s to:\n\t%s",
            self.url,
            self.target
        )
        with open(self.target, 'wt') as f:
            f.write(frontmatter.dumps(self.fm))

    @property
    def archiveorgurl(self):
        logging.debug("trying archive.org for %s", self.url)
        a = self.fetch(
            "http://archive.org/wayback/available?url=%s" % self.url,
        )
        if not a:
            logging.debug("no entry for %s on archive.org", self.url)
            return None

        try:
            a = json.loads(a.text)
            aurl = a.get(
                'archived_snapshots', {}
            ).get(
                'closest', {}
            ).get(
                'url', None
            )
            logging.debug("found %s in archive.org for %s", aurl, self.url)
            self.updateurl(aurl)
            return self.fetch(aurl)
        except Exception as e:
            logging.error("archive.org parsing failed: %s", e)
        return None

    def fetch(self, url):
        try:
            r = requests.get(
                self.url,
                allow_redirects=True,
                timeout=60,
                headers=self.headers
            )
            if r.status_code == requests.codes.ok:
                return r
        except Exception as e:
            return None

    #def read():
        #if os.path.isfile(self.target):
            #with open(self.target) as f:
                #self.fm = frontmatter.loads(f.read())
                #return

    def run(self):
        if self.exists:
            logging.info("offline archive for %s already exists", self.url)
            return

        logging.info("preparing offline copy of %s", self.url)

        if not self.skip_fetch:
            r = self.fetch(self.url)

            # in case it's not, try to look for an archive.org url:
            if not r:
                logging.warning("couldn't get live version of %s, trying archive.org", self.url)
                r = self.fetch(self.archiveorgurl)

            # no live and no archive.org entry :((
            # however, by a miracle, I may already have a copy, so skip if it's there already
            if not r:
                logging.error("no live or archive version of %s found :((", self.url)
                if not self.exists:
                    self.save()
                return

            self.fm.content = r.text

        self._get_images()
        self.save()
class Renderer(object):
    def __init__(self):
        self.sitevars = dict(shared.config.items('site'))
@@ -604,8 +356,8 @@ class Comment(BaseRenderable):
        self._tmplvars = {
            'published': self.published.datetime,
            'author': self.meta.get('author', {}),
-           'content': self.content,
-           'html': self.html,
+           #'content': self.content,
+           #'html': self.html,
            'source': self.source,
            'target': self.targeturl,
            'type': self.meta.get('type', 'webmention'),
@@ -795,8 +547,9 @@ class WebImage(object):
                    shared.config.get('target', 'filesdir'),
                    fname
                ),
-               'url': "%s/%s/%s" % (
-                   shared.config.get('site', 'url'),
+               'url': "/%s/%s" % (
+                   #'url': "%s/%s/%s" % (
+                   #shared.config.get('site', 'url'),
                    shared.config.get('source', 'files'),
                    fname
                ),
@@ -812,8 +565,9 @@ class WebImage(object):
            self.small = [e for e in self.sizes if e[1]['crop'] == False][0][1]['url']
            self.target = self.sizes[-1][1]['url']
        else:
-           self.small = self.fallback = "%s/%s/%s" % (
-               shared.config.get('site', 'url'),
+           self.small = self.fallback = "/%s/%s" % (
+               #self.small = self.fallback = "%s/%s/%s" % (
+               #shared.config.get('site', 'url'),
                shared.config.get('source', 'files'),
                "%s%s" % (self.fname, self.ext)
            )
@@ -1129,12 +883,12 @@ class Taxonomy(BaseIter):
        self.taxonomy = taxonomy

-   @property
-   def pages(self):
-       if hasattr(self, '_pages'):
-           return self._pages
-       self._pages = math.ceil(len(self.data) / shared.config.getint('common', 'pagination'))
-       return self._pages
+   #@property
+   #def pages(self):
+       #if hasattr(self, '_pages'):
+           #return self._pages
+       #self._pages = math.ceil(len(self.data) / shared.config.getint('common', 'pagination'))
+       #return self._pages

    def __repr__(self):
        return "taxonomy %s with %d items" % (self.taxonomy, len(self.data))
@@ -1184,71 +938,48 @@ class Taxonomy(BaseIter):
    def __mkdirs(self):
        check = [self.basep, self.myp, self.feedp]
-       if self.pages > 1:
-           check.append(self.pagep)
-           for i in range(2, self.pages+1):
-               subpagep = os.path.abspath(os.path.join(
-                   self.pagep,
-                   '%d' % i
-               ))
-               check.append(subpagep)
        for p in check:
            if not os.path.isdir(p):
                logging.debug("creating dir %s", p)
                os.mkdir(p)

    def tpath(self, page):
        if page == 1:
-           return "%s/index.html" % (self.myp)
+           p = "%s" % (self.myp)
        else:
-           return "%s/%d/index.html" % (self.pagep, page)
+           p = os.path.join(self.pagep, "%d" % page)
+       if not os.path.isdir(p):
+           logging.debug("creating dir %s", p)
+           os.mkdir(p)
+       return os.path.join(p, "index.html")

-   async def grender(self, renderer):
-       #if not self.slug or self.slug is 'None':
-           #return
-       self.__mkdirs()
-       target = self.tpath(1)
-       target = target.replace('index', 'gallery')
-
-       if not shared.config.getboolean('params', 'force') and os.path.isfile(target):
-           ttime = int(os.path.getmtime(target))
-           mtime = self.mtime
-           if ttime == mtime:
-               logging.info('taxonomy index for "%s" exists and up-to-date (lastmod: %d)', self.slug, ttime)
-               return
-           else:
-               logging.info('taxonomy update needed: %s timestamp is %d, last post timestamp is %d (%s)',
-                   target,
-                   ttime,
-                   mtime,
-                   self.data[mtime].fname
-               )
-
-       posttmpls = [self.data[k].tmplvars for k in list(sorted(
-           self.data.keys(), reverse=True))]
-
-       logging.info("rendering gallery to %s", target)
-       tmplvars = {
-           'taxonomy': {
-               'url': self.baseurl,
-               'name': self.name,
-               'slug': self.slug,
-               'taxonomy': self.taxonomy,
-               'lastmod': arrow.get(self.mtime).datetime
-           },
-           'site': renderer.sitevars,
-           'posts': posttmpls,
-       }
-       r = renderer.j2.get_template('gallery.html').render(tmplvars)
-       with open(target, "wt") as html:
-           html.write(r)
-       os.utime(target, (self.mtime, self.mtime))
+   @property
+   def is_singlepage(self):
+       spcats = shared.config.get('common', 'onepagecategories').split(',')
+       if self.name in spcats and 'category' == self.taxonomy:
+           return True
+       return False
+
+   def posttmpls(self, order='time', start=0, end=None):
+       end = end or len(self.data)
+       if 'all' == order:
+           return [
+               i.tmplvars
+               for k, i in list(sorted(
+                   self.data.items(),
+                   key=lambda value: value[1].title.lower()
+               ))
+           ]
+
+       return [
+           self.data[k].tmplvars
+           for k in list(sorted(
+               self.data.keys(),
+               reverse=True
+           ))[start:end]
+       ]

    async def render(self, renderer):
@@ -1272,20 +1003,59 @@ class Taxonomy(BaseIter):
                    self.data[mtime].fname
                )

-       while page <= self.pages:
-           self.renderpage(renderer, page)
-           page = page+1
-
-   def renderpage(self, renderer, page):
-       pagination = int(shared.config.get('common', 'pagination'))
-       start = int((page-1) * pagination)
-       end = int(start + pagination)
-
-       posttmpls = [self.data[k].tmplvars for k in list(sorted(
-           self.data.keys(), reverse=True))[start:end]]
+       if self.is_singlepage:
+           pagination = len(self.data)
+       else:
+           pagination = shared.config.getint('common', 'pagination')
+       pages = math.ceil(len(self.data) / pagination)
+
+       while page <= pages:
+           self.render_page(renderer, page, pagination, pages)
+           page = page+1
+
+       self.render_feeds(renderer)
+       self.ping_websub
+
+   def render_feeds(self, renderer):
+       pagination = shared.config.getint('common', 'pagination')
+       start = 0
+       end = int(start + pagination)
+       posttmpls = self.posttmpls('time', start, end)
+       tmplvars = {
+           'taxonomy': {
+               'url': self.baseurl,
+               'name': self.name,
+               'slug': self.slug,
+               'taxonomy': self.taxonomy,
+               'lastmod': arrow.get(self.mtime).datetime
+           },
+           'site': renderer.sitevars,
+           'posts': posttmpls,
+       }
+       target = os.path.join(self.feedp, 'index.atom')
+       logging.info("rendering Atom feed to %s", target)
+       r = renderer.j2.get_template('atom.html').render(tmplvars)
+       with open(target, "wt") as html:
+           html.write(r)
+       os.utime(target, (self.mtime, self.mtime))
+
+   def render_page(self, renderer, page, pagination, pages):
+       if self.is_singlepage:
+           posttmpls = self.posttmpls('all')
+       else:
+           start = int((page-1) * pagination)
+           end = int(start + pagination)
+           posttmpls = self.posttmpls('time', start, end)

        target = self.tpath(page)
+       tdir = os.path.dirname(target)
+       if not os.path.isdir(tdir):
+           logging.debug("creating dir %s", tdir)
+           os.mkdir(tdir)
        logging.info("rendering taxonomy page %d to %s", page, target)
        tmplvars = {
            'taxonomy': {
@@ -1294,7 +1064,7 @@ class Taxonomy(BaseIter):
                'slug': self.slug,
                'taxonomy': self.taxonomy,
                'paged': page,
-               'total': self.pages,
+               'total': pages,
                'perpage': pagination,
                'lastmod': arrow.get(self.mtime).datetime
            },
@@ -1307,64 +1077,108 @@ class Taxonomy(BaseIter):
            html.write(r)
        os.utime(target, (self.mtime, self.mtime))

-       if 1 == page:
-           #target = os.path.join(self.feedp, 'index.rss')
-           #logging.info("rendering RSS feed to %s", target)
-           #r = renderer.j2.get_template('rss.html').render(tmplvars)
-           #with open(target, "wt") as html:
-               #html.write(r)
-           #os.utime(target, (self.mtime, self.mtime))
-
-           target = os.path.join(self.feedp, 'index.atom')
-           logging.info("rendering Atom feed to %s", target)
-           r = renderer.j2.get_template('atom.html').render(tmplvars)
-           with open(target, "wt") as html:
-               html.write(r)
-           os.utime(target, (self.mtime, self.mtime))
-
-       # ---
-       # this is a joke
-       # see http://indieweb.org/YAMLFeed
-       # don't do YAMLFeeds.
-       if 1 == page:
-           fm = frontmatter.loads('')
-           fm.metadata = {
-               'site': {
-                   'author': renderer.sitevars['author'],
-                   'url': renderer.sitevars['url'],
-                   'title': renderer.sitevars['title'],
-               },
-               'items': [],
-           }
-
-           for p in posttmpls:
-               fm.metadata['items'].append({
-                   'title': p['title'],
-                   'url': "%s/%s/" % ( renderer.sitevars['url'], p['slug']),
-                   'content': p['content'],
-                   'summary': p['summary'],
-                   'published': p['published'],
-                   'updated': p['updated'],
-               })
-
-           target = os.path.join(self.feedp, 'index.yml')
-           logging.info("rendering YAML feed to %s", target)
-           with open(target, "wt") as html:
-               html.write(frontmatter.dumps(fm))
-           os.utime(target, (self.mtime, self.mtime))
-       # ---
-
-       if 1 == page:
-           if not self.taxonomy or self.taxonomy == 'category':
-               t = shared.config.get('site', 'websuburl')
-               data = {
-                   'hub.mode': 'publish',
-                   'hub.url': "%s%s" % (
-                       shared.config.get('site', 'url'), self.baseurl
-                   )
-               }
-               logging.info("pinging %s with data %s", t, data)
-               requests.post(t, data=data)
+   def ping_websub(self):
+       if not self.taxonomy or self.taxonomy == 'category':
+           t = shared.config.get('site', 'websuburl')
+           data = {
+               'hub.mode': 'publish',
+               'hub.url': "%s%s" % (
+                   shared.config.get('site', 'url'), self.baseurl
+               )
+           }
+           logging.info("pinging %s with data %s", t, data)
+           requests.post(t, data=data)
+
+   #def renderpage(self, renderer, page):
+       #pagination = int(shared.config.get('common', 'pagination'))
+       #start = int((page-1) * pagination)
+       #end = int(start + pagination)
+
+       #posttmpls = [self.data[k].tmplvars for k in list(sorted(
+           #self.data.keys(), reverse=True))[start:end]]
+
+       #target = self.tpath(page)
+       #logging.info("rendering taxonomy page %d to %s", page, target)
+       #tmplvars = {
+           #'taxonomy': {
+               #'url': self.baseurl,
+               #'name': self.name,
+               #'slug': self.slug,
+               #'taxonomy': self.taxonomy,
+               #'paged': page,
+               #'total': self.pages,
+               #'perpage': pagination,
+               #'lastmod': arrow.get(self.mtime).datetime
+           #},
+           #'site': renderer.sitevars,
+           #'posts': posttmpls,
+       #}
+       #r = renderer.j2.get_template('archive.html').render(tmplvars)
+       #with open(target, "wt") as html:
+           #html.write(r)
+       #os.utime(target, (self.mtime, self.mtime))
+
+       #if 1 == page:
+           ##target = os.path.join(self.feedp, 'index.rss')
+           ##logging.info("rendering RSS feed to %s", target)
+           ##r = renderer.j2.get_template('rss.html').render(tmplvars)
+           ##with open(target, "wt") as html:
+               ##html.write(r)
+           ##os.utime(target, (self.mtime, self.mtime))
+
+           #target = os.path.join(self.feedp, 'index.atom')
+           #logging.info("rendering Atom feed to %s", target)
+           #r = renderer.j2.get_template('atom.html').render(tmplvars)
+           #with open(target, "wt") as html:
+               #html.write(r)
+           #os.utime(target, (self.mtime, self.mtime))
+
+       ## ---
+       ## this is a joke
+       ## see http://indieweb.org/YAMLFeed
+       ## don't do YAMLFeeds.
+       #if 1 == page:
+           #fm = frontmatter.loads('')
+           #fm.metadata = {
+               #'site': {
+                   #'author': renderer.sitevars['author'],
+                   #'url': renderer.sitevars['url'],
+                   #'title': renderer.sitevars['title'],
+               #},
+               #'items': [],
+           #}
+
+           #for p in posttmpls:
+               #fm.metadata['items'].append({
+                   #'title': p['title'],
+                   #'url': "%s/%s/" % ( renderer.sitevars['url'], p['slug']),
+                   #'content': p['content'],
+                   #'summary': p['summary'],
+                   #'published': p['published'],
+                   #'updated': p['updated'],
+               #})
+
+           #target = os.path.join(self.feedp, 'index.yml')
+           #logging.info("rendering YAML feed to %s", target)
+           #with open(target, "wt") as html:
+               #html.write(frontmatter.dumps(fm))
+           #os.utime(target, (self.mtime, self.mtime))
+       ## ---
+
+       #if 1 == page:
+           #if not self.taxonomy or self.taxonomy == 'category':
+               #t = shared.config.get('site', 'websuburl')
+               #data = {
+                   #'hub.mode': 'publish',
+                   #'hub.url': "%s%s" % (
+                       #shared.config.get('site', 'url'), self.baseurl
+                   #)
+               #}
+               #logging.info("pinging %s with data %s", t, data)
+               #requests.post(t, data=data)
class Content(BaseIter):
@@ -1511,9 +1325,9 @@ class Singular(BaseRenderable):
            #self.photo,
            #self.content,
        #)
-       if shared.config.getboolean('params', 'nooffline'):
-           return
-       trigger = self.offlinecopies
+       #if shared.config.getboolean('params', 'nooffline'):
+           #return
+       #trigger = self.offlinecopies

    #def __filter_syndication(self):
        #syndications = self.meta.get('syndicate', None)
@@ -1680,7 +1494,7 @@ class Singular(BaseRenderable):
                'text': 'CC BY 4.0',
                'description': 'Licensed under <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</a>. You are free to share or republish, even if modified, if you link back here and indicate the modifications, even for commercial use.'
            }
-       if 'journal' == self.category:
+       elif 'journal' == self.category:
            l = {
                'url': 'https://creativecommons.org/licenses/by-nc/4.0/',
                'text': 'CC BY-NC 4.0',
@@ -1894,26 +1708,26 @@ class Singular(BaseRenderable):
        return self._sumhtml

-   @property
-   def offlinecopies(self):
-       # stupidly simple property caching
-       if hasattr(self, 'copies'):
-           return self.copies
-
-       copies = {}
-       for maybe in ['bookmark-of', 'in-reply-to', 'repost-of', 'favorite-of']:
-           maybe = self.meta.get(maybe, False)
-           if not maybe:
-               continue
-           if not isinstance(maybe, list):
-               maybe = [maybe]
-           for url in maybe:
-               arch = OfflineArchive(url)
-               arch.run()
-               #copies[url] = arch.read()
-
-       #self.copies = copies
-       #return copies
+   #@property
+   #def offlinecopies(self):
+       ## stupidly simple property caching
+       #if hasattr(self, 'copies'):
+           #return self.copies
+
+       #copies = {}
+       #for maybe in ['bookmark-of', 'in-reply-to', 'repost-of', 'favorite-of']:
+           #maybe = self.meta.get(maybe, False)
+           #if not maybe:
+               #continue
+           #if not isinstance(maybe, list):
+               #maybe = [maybe]
+           #for url in maybe:
+               #arch = OfflineArchive(url)
+               #arch.run()
+               ##copies[url] = arch.read()
+
+       ##self.copies = copies
+       ##return copies

    @property
@@ -2157,10 +1971,6 @@ class NASG(object):
        for e in [self.content.categories, self.content.tags]:
            for name, t in e.items():
                await t.render(self.renderer)
-               if name == 'photo' and t.taxonomy == 'category':
-                   await t.grender(self.renderer)

    async def __afrender(self):
        await self.content.front.render(self.renderer)

offlinecopies.py (new file)

@@ -0,0 +1,39 @@
import glob
import shared
import os
import logging
import frontmatter

# remove the rest of the potential loggers
while len(logging.root.handlers) > 0:
    logging.root.removeHandler(logging.root.handlers[-1])

# --- set loglevel
logging.basicConfig(
    level=10,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

bookmarks = glob.glob('/web/petermolnar.net/petermolnar.net/content/bookmark/*.md')
bm = {}
for b in bookmarks:
    with open(b, 'rt') as f:
        fm = frontmatter.loads(f.read())
        if not fm.metadata.get('bookmark-of'):
            continue
        bm[b] = fm

for fname, fm in bm.items():
    logging.info('dealing with %s', fname)
    url = fm.metadata.get('bookmark-of')
    f, ext = os.path.splitext(os.path.basename(fname))
    p = os.path.join(
        shared.config.get('source', 'offlinecopiesdir'),
        f
    )
    if os.path.isdir(p):
        continue
    trueurl = shared.find_archiveorgurl(url)
    w = shared.wget(trueurl, dirname=f)
    w.archive()

shared.py

@@ -5,6 +5,8 @@ import glob
import logging
import subprocess
import json
+import requests
+from urllib.parse import urlparse, urlunparse

from whoosh import fields
from whoosh import analysis
@@ -250,3 +252,141 @@ class Pandoc(CMDLine):
                stderr
            )
        return stdout.decode('utf-8').strip()


class HeadlessChromium(CMDLine):
    def __init__(self, url):
        super().__init__('chromium-browser')
        self.url = url

    def get(self):
        cmd = (
            self.executable,
            '--headless',
            '--disable-gpu',
            '--disable-preconnect',
            '--dump-dom',
            '--timeout 60',
            '--save-page-as-mhtml',
            "%s" % self.url
        )
        logging.debug('getting URL %s with headless chrome', self.url)
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = p.communicate()
        if stderr:
            logging.error(
                "Error getting URL:\n\t%s\n\t%s",
                cmd,
                stderr
            )
        return stdout.decode('utf-8').strip()
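A minimal usage sketch for the class above; it assumes a chromium-browser binary on PATH, and nothing in this diff calls the class yet, so treat it as illustrative only:

    from shared import HeadlessChromium

    dom = HeadlessChromium('http://example.com/').get()  # dumped DOM as a string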
class wget(CMDLine):
    def __init__(self, url, dirname=None):
        super().__init__('wget')
        self.url = url
        self.slug = dirname or slugfname(self.url)
        self.saveto = os.path.join(
            config.get('source', 'offlinecopiesdir'),
            self.slug
        )

    def archive(self):
        cmd = (
            self.executable,
            '-e',
            'robots=off',
            '--timeout=360',
            '--no-clobber',
            '--no-directories',
            '--adjust-extension',
            '--span-hosts',
            '--wait=1',
            '--random-wait',
            '--convert-links',
            #'--backup-converted',
            '--page-requisites',
            '--directory-prefix=%s' % self.saveto,
            "%s" % self.url
        )
        logging.debug('getting URL %s with wget', self.url)
        p = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = p.communicate()
        if stderr:
            logging.error(
                "Error getting URL:\n\t%s\n\t%s",
                cmd,
                stderr
            )
        return stdout.decode('utf-8').strip()
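A usage sketch for the wget wrapper, mirroring what offlinecopies.py does; it assumes wget is installed and source.offlinecopiesdir is set in the shared config, and the URL and slug are illustrative:

    from shared import wget

    copy = wget('http://example.com/some-article', dirname='example-com-some-article')
    copy.archive()
    # the page and its requisites are mirrored under <offlinecopiesdir>/example-com-some-article,
    # with links rewritten for local browsing (--convert-links)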
def find_realurl(url):
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })

    try:
        r = requests.get(
            url,
            allow_redirects=True,
            timeout=60,
            headers=headers
        )
    except Exception as e:
        logging.error('getting real url failed: %s', e)
        return (None, 400)

    finalurl = list(urlparse(r.url))
    finalurl[4] = '&'.join(
        [x for x in finalurl[4].split('&') if not x.startswith('utm_')])
    finalurl = urlunparse(finalurl)
    return (finalurl, r.status_code)
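For illustration, find_realurl follows redirects and strips utm_* tracking parameters from the query string; assuming the URL below resolves directly with HTTP 200:

    finalurl, status = find_realurl('https://example.com/post?utm_source=rss&id=42')
    # -> ('https://example.com/post?id=42', 200)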
def find_archiveorgurl(url):
    url, status = find_realurl(url)
    if status == requests.codes.ok:
        return url

    try:
        a = requests.get(
            "http://archive.org/wayback/available?url=%s" % url,
        )
    except Exception as e:
        logging.error('Failed to fetch archive.org availability for %s' % url)
        return None

    if not a:
        logging.error('empty archive.org availability for %s' % url)
        return None

    try:
        a = json.loads(a.text)
        aurl = a.get(
            'archived_snapshots', {}
        ).get(
            'closest', {}
        ).get(
            'url', None
        )
        if aurl:
            logging.debug("found %s in archive.org for %s", aurl, url)
            return aurl
    except Exception as e:
        logging.error("archive.org parsing failed: %s", e)

    return None
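For reference, the Wayback availability endpoint queried above responds with JSON roughly of this shape, and find_archiveorgurl returns the closest snapshot URL (values here are illustrative):

    # {"archived_snapshots": {"closest": {"available": true,
    #     "url": "http://web.archive.org/web/20170904000000/http://example.com/gone-page",
    #     "timestamp": "20170904000000", "status": "200"}}}
    aurl = find_archiveorgurl('http://example.com/gone-page')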