123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- # This is part of Kaylee
- # -- this code is licensed GPLv3
- # Copyright 2015-2016 Clayton G. Hobbs
- # Portions Copyright 2013 Jezra
-
- import hashlib
- import json
- import re
-
- import requests
-
- class LanguageUpdater:
-
- def __init__(self, config):
- self.config = config
-
- def update_language_if_changed(self):
- """Test if the language has changed, and if it has, update it"""
- if self.language_has_changed():
- self.update_language()
- self.save_language_hash()
-
- def language_has_changed(self):
- """Use SHA256 hashes to test if the language has changed"""
- # Load the stored hash from the hash file
- try:
- with open(self.config.hash_file, 'r') as f:
- hashes = json.load(f)
- self.stored_hash = hashes['language']
- except (IOError, KeyError, TypeError):
- # No stored hash
- self.stored_hash = ''
-
- # Calculate the hash the language file has right now
- hasher = hashlib.sha256()
- with open(self.config.strings_file, 'rb') as sfile:
- buf = sfile.read()
- hasher.update(buf)
- self.new_hash = hasher.hexdigest()
-
- return self.new_hash != self.stored_hash
-
- def update_language(self):
- """Update the language using the online lmtool"""
- print('Updating language using online lmtool')
-
- host = 'http://www.speech.cs.cmu.edu'
- url = host + '/cgi-bin/tools/lmtool/run'
-
- # Submit the corpus to the lmtool
- response_text = ""
- with open(self.config.strings_file, 'rb') as corpus:
- files = {'corpus': corpus}
- values = {'formtype': 'simple'}
-
- r = requests.post(url, files=files, data=values)
- response_text = r.text
-
- # Parse response to get URLs of the files we need
- path_re = r'.*<title>Index of (.*?)</title>.*'
- number_re = r'.*TAR([0-9]*?)\.tgz.*'
- for line in response_text.split('\n'):
- # If we found the directory, keep it and don't break
- if re.search(path_re, line):
- path = host + re.sub(path_re, r'\1', line)
- # If we found the number, keep it and break
- elif re.search(number_re, line):
- number = re.sub(number_re, r'\1', line)
- break
-
- lm_url = path + '/' + number + '.lm'
- dic_url = path + '/' + number + '.dic'
-
- self._download_file(lm_url, self.config.lang_file)
- self._download_file(dic_url, self.config.dic_file)
-
- def save_language_hash(self):
- new_hashes = {'language': self.new_hash}
- with open(self.config.hash_file, 'w') as f:
- json.dump(new_hashes, f)
-
- def _download_file(self, url, path):
- r = requests.get(url, stream=True)
- if r.status_code == 200:
- with open(path, 'wb') as f:
- for chunk in r:
- f.write(chunk)
|