import html
import logging
import os
import re
import time
from hashlib import sha1
from urllib.parse import urlparse, urlunsplit
from tornado import gen
from tornado import httpclient
from typing import Optional # noqa: F401
try:
import chardet
except ImportError: # type should be Optional[Module] but there's no module in mypy? TODO
chardet = None # type: ignore
from apertium_apy.utils import translation
from apertium_apy.handlers.translate import TranslateHandler
class TranslateWebpageHandler(TranslateHandler):
def url_repl(self, base, attr, quote, aurl):
    """Rebuild one HTML attribute, resolving a relative URL against *base*.

    Args:
        base: Parsed URL (a ``urlparse`` result) of the page the attribute
            was found in.
        attr: Attribute name exactly as matched, e.g. ``href`` or ``src``.
        quote: Quote character that delimited the attribute value.
        aurl: The attribute value, i.e. the URL to resolve.

    Returns:
        The attribute text with a leading space, in the form
        `` attr=<quote>url<quote>``.
    """
    # Function-scope import: the module header only imports
    # urlparse/urlunsplit from urllib.parse.
    from urllib.parse import urljoin

    target = urlparse(aurl)
    if target.netloc or target.scheme:
        # Has a host (absolute or protocol-relative URL), or has a scheme
        # but no host (mailto:, javascript:, data:, ...).  Grafting the
        # page's host onto the latter would corrupt them, so keep as-is.
        newurl = aurl
    else:
        # urljoin resolves against the page's own path, so document-relative
        # paths ("img/a.png", "../x.css"), query-only and fragment-only
        # references all end up pointing where a browser would take them.
        newurl = urljoin(base.geturl(), aurl)
    return ' {a}={q}{u}{q}'.format(a=attr, u=newurl, q=quote)
def unescape(self, page):
    """Return *page* with HTML character references decoded.

    Args:
        page: Page text (``str``) that may contain character references.

    Returns:
        The text with references replaced by the characters they denote.
    """
    # First work around an old bug that exists in a lot of Windows-based
    # web pages, see http://stackoverflow.com/a/1398921/69663 :
    # code 150 (0x96) is "en dash" in Windows-1252 but a C1 control
    # character in Unicode.  Both the reference and the raw character are
    # written as explicit escapes so the literals cannot be silently lost
    # to an editor or an encoding conversion.
    for broken_dash in ('&#150;', '\x96'):
        page = page.replace(broken_dash, '\u2013')
    # Unescape all other entities the regular way:
    return html.unescape(page)
def clean_html(self, page, urlbase):
    """Unescape *page* and apply the text clean-ups used before translation.

    Args:
        page: Decoded page text.
        urlbase: Parsed URL of the page; only its ``netloc`` is consulted.

    Returns:
        The page with character references decoded (via ``self.unescape``)
        and soft hyphens (U+00AD) removed.  For pages on avvir.no, an ``=``
        standing directly between two lowercase letters of the character
        class below is dropped as well.
    """
    cleaned = self.unescape(page)
    on_avvir = urlbase.netloc in ('www.avvir.no', 'avvir.no')
    if on_avvir:
        cleaned = re.sub(r'([a-zæøåášžđŋ])=([a-zæøåášžđŋ])', '\\1\\2', cleaned)
    # soft hyphen
    return cleaned.replace('\u00ad', '')
def html_to_text(self, page, url):
    """Decode the raw bytes of a fetched page and rewrite its link attributes.

    Args:
        page: Page body as ``bytes``.
        url: Address the page was fetched from.

    Returns:
        The decoded page after ``self.clean_html``, with every
        `` href=``/`` src=`` attribute passed through ``self.url_repl``.

    Raises:
        UnicodeDecodeError: If *page* cannot be decoded with the chosen
            encoding.
    """
    if chardet:
        # Use the detected encoding; fall back to UTF-8 when detection
        # yields nothing.
        guessed = chardet.detect(page).get('encoding', 'utf-8')
        encoding = guessed or 'utf-8'
    else:
        encoding = 'utf-8'
    base = urlparse(url)
    text = self.clean_html(page.decode(encoding), base)  # type: str

    def rewrite(match):
        # Groups: 1 = attribute name, 2 = quote character, 3 = URL.
        return self.url_repl(base, match.group(1), match.group(2), match.group(3))

    return re.sub(r' (href|src)=([\'"])(..*?)\2', rewrite, text)
def set_cached(self, pair, url, translated, origtext):
    """Cache translated text for a pair and url to memory, and disk.

    Also caches origtext to disk; see cache_path.

    Args:
        pair: Language pair; used as the in-memory cache key, and its first
            element names the file holding *origtext*.
        url: Address of the translated page.
        translated: Translated page text.
        origtext: Untranslated page text, stored so that it can be
            retranslated later.
    """
    # Start a fresh per-pair cache the first time a pair is seen, and also
    # whenever the existing one has outgrown the configured limit.
    if pair not in self.url_cache or len(self.url_cache[pair]) > self.max_inmemory_url_cache:
        self.url_cache[pair] = {}
    ts = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
    self.url_cache[pair][url] = (ts, translated)
    if self.url_cache_path is None:
        logging.info('No --url-cache-path, not storing cached url to disk')
        return
    dirname, basename = self.cache_path(self.url_cache_path, pair, url)
    os.makedirs(dirname, exist_ok=True)
    statvfs = os.statvfs(dirname)
    # Fragment size times fragments available to unprivileged users.
    if (statvfs.f_frsize * statvfs.f_bavail) < self.min_free_space_disk_url_cache:
        logging.warning('Disk of --url-cache-path has < {} free, not storing cached url to disk'.format(
            self.min_free_space_disk_url_cache))
        return
    # Note: If we make this a @gen.coroutine, we will need to lock
    # the file to avoid concurrent same-url requests clobbering:
    path = os.path.join(dirname, basename)
    # File layout: timestamp on the first line, translated text after it.
    with open(path, 'w') as f:
        f.write(ts)
        f.write('\n')
        f.write(translated)
    origpath = os.path.join(dirname, pair[0])
    with open(origpath, 'w') as f:
        f.write(origtext)
def cache_path(self, url_cache_path, pair, url):
    """Return ``(directory, filename)`` for the cached translation of *url*.

    Args:
        url_cache_path: Root directory of the on-disk cache.
        pair: Language pair; its first two elements form the file name.
        url: Address whose SHA-1 hex digest determines the directory.

    Returns:
        A tuple of the directory to cache this url in, and the file name to
        use for this pair.
    """
    digest = sha1(url.encode('utf-8')).hexdigest()
    # Split the digest over nested directories to avoid too many files in
    # one dir:
    first, second, remainder = digest[0], digest[1], digest[2:]
    directory = os.path.join(url_cache_path, first, second, remainder)
    filename = '{}-{}'.format(*pair)
    return (directory, filename)
def get_cached(self, pair, url):
    """Look up a cached translation of *url* for *pair*.

    The in-memory cache is consulted first, then the on-disk cache.

    Args:
        pair: Language pair the translation was made with.
        url: Address of the page.

    Returns:
        A ``(timestamp, translated_text)`` tuple, or ``None`` when nothing
        usable is cached or no cache path is configured.
    """
    if not self.url_cache_path:
        # NOTE(review): this also skips the in-memory cache, which
        # set_cached fills even without a cache path -- confirm intended.
        return None
    in_memory = self.url_cache.setdefault(pair, {})
    if url in in_memory:
        logging.info('Got cache from memory')
        return in_memory[url]
    dirname, basename = self.cache_path(self.url_cache_path, pair, url)
    path = os.path.join(dirname, basename)
    # Open directly instead of checking existence first: the entry may be
    # removed between a check and the open.
    try:
        with open(path, 'r') as f:
            # First line is the timestamp, the remainder is the translation.
            entry = (f.readline().strip(), f.read())
    except FileNotFoundError:
        return None
    except OSError as e:
        # An unreadable entry is treated as a miss rather than failing the
        # whole request.
        logging.warning('Could not read cached url from %s: %s', path, e)
        return None
    logging.info('Got cache on disk, we want to retranslate in background …')
    return entry
def retranslate_cache(self, pair, url, cached):
    """If we've got something from the cache, and it isn't in memory, then
    it was from disk. We want to retranslate anything we found on
    disk, since it's probably using older versions of the language
    pair.

    Args:
        pair: Language pair; its first element names the stored original.
        url: Address of the page.
        cached: Whatever the cache lookup returned for this request, or
            ``None``.

    Returns:
        The original (untranslated) text stored on disk for *url*, or
        ``None`` when there is nothing to retranslate.
    """
    if cached is None or self.url_cache_path is None:
        return None
    if self.url_cache.get(pair, {}).get(url) is not None:
        # Present in memory, so the cached value did not come from disk.
        return None
    dirname, _ = self.cache_path(self.url_cache_path, pair, url)
    origpath = os.path.join(dirname, pair[0])
    if not os.path.exists(origpath):
        return None
    # Context manager so the handle is closed deterministically.
    with open(origpath, 'r') as f:
        return f.read()
@gen.coroutine
def get(self):
    """Handle a GET request: fetch a web page, translate it and respond.

    Request arguments read here:
        langpair: Language pair, validated through ``get_pair_or_error``.
        prefs: Optional, defaults to ``''``; passed on to the translation
            call.
        url: Address of the page; ``http://`` is prepended unless the value
            already starts with ``http``.

    Responds with a 404 error when the first fetch raises, and with a 503
    error when the second fetch fails with a non-304 HTTP error, the body
    is empty, or the body cannot be decoded.  Otherwise sends a response
    dict carrying the translated page, and stores the translation through
    ``set_cached``.
    """
    pair = self.get_pair_or_error(self.get_argument('langpair'),
                                  # Don't yet know the size of the text, and don't want to fetch it unnecessarily:
                                  -1)
    if pair is None:
        return
    self.note_pair_usage(pair)
    prefs = self.get_argument('prefs', default='')
    mode_path = self.pairs['%s-%s' % pair]
    url = self.get_argument('url')
    if not url.startswith('http'):
        url = 'http://' + url
    got304 = False
    cached = self.get_cached(pair, url)
    if prefs:
        logging.warning("Web translation with prefs doesn't work with caching yet")
    request = httpclient.HTTPRequest(url=url,
                                     # TODO: tweak timeouts:
                                     connect_timeout=20.0,
                                     request_timeout=20.0)
    # NOTE(review): the page is fetched twice with the same request.  Any
    # exception from the first fetch is answered with a 404, so the 304
    # branch below is only reached when the second fetch behaves
    # differently from the first -- confirm whether one fetch was intended.
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(request)
    except Exception as e:
        logging.info('%s exception has occurred', e)
        self.send_error(404, explanation='{} on fetching url: {}'.format('Error 404', e))
        return
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(request, raise_error=True)
    except httpclient.HTTPError as e:
        if e.code == 304:
            got304 = True
            logging.info('304, can use cache')
        else:
            logging.error(e)
            # Report the error that was just caught; `response` still holds
            # the result of the earlier, successful fetch at this point.
            self.send_error(503, explanation='{} on fetching url: {}'.format(e.code, e))
            return
    if got304 and cached is not None:
        # cached is (timestamp, translated text); pass the text through.
        translation_catpipeline = translation.CatPipeline  # type: ignore
        translated = yield translation_catpipeline().translate(cached[1])
    else:
        if response.body is None:
            self.send_error(503, explanation='got an empty file on fetching url: {}'.format(url))
            return
        page = response.body  # type: bytes
        try:
            to_translate = self.html_to_text(page, url)
        except UnicodeDecodeError as e:
            logging.info("/translatePage '{}' gave UnicodeDecodeError {}".format(url, e))
            self.send_error(503, explanation="Couldn't decode (or detect charset/encoding of) {}".format(url))
            return
        before = self.log_before_translation()
        translated = yield translation.translate_html_mark_headings(to_translate, mode_path, prefs)
        self.log_after_translation(before, len(to_translate))
        self.set_cached(pair, url, translated, to_translate)
    self.send_response({
        'responseData': {
            'translatedText': self.maybe_strip_marks(self.mark_unknown, pair, translated),
        },
        'responseDetails': None,
        'responseStatus': 200,
    })
    # The response has been sent; if the cached copy came from disk,
    # retranslate the stored original and refresh the cache.
    retranslate = self.retranslate_cache(pair, url, cached)
    if got304 and retranslate is not None:
        logging.info('Retranslating {}'.format(url))
        translated = yield translation.translate_html_mark_headings(retranslate, mode_path, prefs)
        logging.info('Done retranslating {}'.format(url))
        self.set_cached(pair, url, translated, retranslate)