纯 Python 实现的 Google 批量翻译
首先声明,没有什么不良动机,因为经常会用 translate.google.cn,就想着用 Python 模拟网页提交实现文档的批量翻译。据说有 API,可是要收费。 生成 Token:Google 为防爬虫而生成 token 的代码是 JavaScript 的,且 token 是根据网站的 TKK 值和提交的文本动态生成的。 原始(晦涩)JavaScript 代码:var b = function (a,b) { for (var d = 0; d < b.length - 2; d += 3) { var c = b.charAt(d + 2),c = "a" <= c ? c.charCodeAt(0) - 87 : Number(c),c = "+" == b.charAt(d + 1) ? a >>> c : a << c; a = "+" == b.charAt(d) ? a + c & 4294967295 : a ^ c } return a } var tk = function (a,TKK) { for (var e = TKK.split("."),h = Number(e[0]) || 0,g = [],d = 0,f = 0; f < a.length; f++) { var c = a.charCodeAt(f); 128 > c ? g[d++] = c : (2048 > c ? g[d++] = c >> 6 | 192 : (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512) ? (c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023),g[d++] = c >> 18 | 240,g[d++] = c >> 12 & 63 | 128) : g[d++] = c >> 12 | 224,g[d++] = c >> 6 & 63 | 128),g[d++] = c & 63 | 128) } a = h; for (d = 0; d < g.length; d++) a += g[d],a = b(a,"+-a^+6"); a = b(a,"+-3^+b+-f"); a ^= Number(e[1]) || 0; 0 > a && (a = (a & 2147483647) + 2147483648); a %= 1E6; return a.toString() + "." + (a ^ h) } 易懂的 JavaScript 代码:function RL(a,b) { for (var d = 0; d < b.length - 2; d += 3) { var c = b.charAt(d + 2); c = "a" <= c ? c.charCodeAt(0) - 87 : Number(c); c = "+" == b.charAt(d + 1) ? a >>> c : a << c; a = "+" == b.charAt(d) ? 
a + c & 4294967295 : a ^ c; } return a } function TL(a,TKK) { var e = TKK.split("."); var h = Number(e[0]) || 0; var g = []; var d = 0; for (var f = 0; f < a.length; f++) { var c = a.charCodeAt(f); if (128 > c) { g[d++] = c; } else { if (2048 > c) { g[d++] = c >> 6 | 192; } else { if (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512)) { c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023); g[d++] = c >> 18 | 240; g[d++] = c >> 12 & 63 | 128; } else { g[d++] = c >> 12 | 224; g[d++] = c >> 6 & 63 | 128; } } g[d++] = c & 63 | 128; } } a = h; for (var d = 0; d < g.length; d++) { a += g[d]; a = b(a,"+-a^+6"); } a = b(a,"+-3^+b+-f"); a ^= Number(e[1]) || 0; 0 > a && (a = (a & 2147483647) + 2147483648); a %= 1E6; return a.toString() + "." + (a ^ h) } Python 代码def getGoogleToken(a,TKK): def RL(a,b): for d in range(0,len(b)-2,3): c = b[d + 2] c = ord(c[0]) - 87 if 'a' <= c else int(c) c = a >> c if '+' == b[d + 1] else a << c a = a + c & 4294967295 if '+' == b[d] else a ^ c return a g = [] f = 0 while f < len(a): c = ord(a[f]) if 128 > c: g.append(c) else: if 2048 > c: g.append((c >> 6) | 192) else: if (55296 == (c & 64512)) and (f + 1 < len(a)) and (56320 == (ord(a[f+1]) & 64512)): f += 1 c = 65536 + ((c & 1023) << 10) + (ord(a[f]) & 1023) g.append((c >> 18) | 240) g.append((c >> 12) & 63 | 128) else: g.append((c >> 12) | 224) g.append((c >> 6) & 63 | 128) g.append((c & 63) | 128) f += 1 e = TKK.split('.') h = int(e[0]) or 0 t = h for item in g: t += item t = RL(t,'+-a^+6') t = RL(t,'+-3^+b+-f') t ^= int(e[1]) or 0 if 0 > t: t = (t & 2147483647) + 2147483648 result = t % 1000000 return str(result) + '.' 
+ str(result ^ h) 获取 Token Key:Google 的 TKK 可以通过访问网站 https://translate.google.cn 获取,页面里有段脚本包含了“tkk:'xxxxxx.xxxxxx'”,用正则表达式截取即可。 res = requests.get('https://translate.google.cn',timeout = 3) res.raise_for_status() result = re.search(r'tkk:\'(\d+\.\d+)?\'',res.text).group(1) 划分文章段落:因为常从 PDF 里复制文本翻译,这样就不能依赖换行符来划分段落了,只能把空行作为段落的分界。 完整代码:代码不长,全文粘贴如下。 import requests import re import json import time class GoogleTranslator (): _host = 'translate.google.cn' _headers = { 'Host': _host,'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8','Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3','Accept-Encoding': 'gzip,deflate,br','Content-Type': 'application/x-www-form-urlencoded;charset=utf-8','Referer': 'https://' + _host,'Connection': 'keep-alive','Cache-Control': 'max-age=0' } _language = { 'afrikaans': 'af','arabic': 'ar','belarusian': 'be','bulgarian': 'bg','catalan': 'ca','czech': 'cs','welsh': 'cy','danish': 'da','german': 'de','greek': 'el','english': 'en','esperanto': 'eo','spanish': 'es','estonian': 'et','persian': 'fa','finnish': 'fi','french': 'fr','irish': 'ga','galician': 'gl','hindi': 'hi','croatian': 'hr','hungarian': 'hu','indonesian': 'id','icelandic': 'is','italian': 'it','hebrew': 'iw','japanese': 'ja','korean': 'ko','latin': 'la','lithuanian': 'lt','latvian': 'lv','macedonian': 'mk','malay': 'ms','maltese': 'mt','dutch': 'nl','norwegian': 'no','polish': 'pl','portuguese': 'pt','romanian': 'ro','russian': 'ru','slovak': 'sk','slovenian': 'sl','albanian': 'sq','serbian': 'sr','swedish': 'sv','swahili': 'sw','thai': 'th','filipino': 'tl','turkish': 'tr','ukrainian': 'uk','vietnamese': 'vi','yiddish': 'yi','chinese_simplified': 'zh-CN','chinese_traditional': 'zh-TW','auto': 'auto' } _url = 'https://' + _host + '/translate_a/single' _params = { 'client': 'webapp','sl': 'en','tl': 
'zh-CN','hl': 'zh-CN','dt': 'at','dt': 'bd','dt': 'ex','dt': 'ld','dt': 'md','dt': 'qca','dt': 'rw','dt': 'rm','dt': 'ss','dt': 't','otf': '1','ssel': '0','tsel': '0','kc': '1' } __cookies = None __googleTokenKey = '376032.257956' __googleTokenKeyUpdataTime = 600.0 __googleTokenKeyRetireTime = time.time() + 600.0 def __init__(self,src = 'en',dest = 'zh-CN',tkkUpdataTime = 600.0): if src not in self._language and src not in self._language.values(): src = 'auto' if dest not in self._language and dest not in self._language.values(): dest = 'auto' self._params['sl'] = src self._params['tl'] = dest self.googleTokenKeyUpdataTime = tkkUpdataTime self.__updateGoogleTokenKey() def __updateGoogleTokenKey(self): self.__googleTokenKey = self.__getGoogleTokenKey() self.__googleTokenKeyRetireTime = time.time() + self.__googleTokenKeyUpdataTime def __getGoogleTokenKey(self): """Get the Google TKK from https://translate.google.cn""" # TKK example: '435075.3634891900' result = '' try: res = requests.get('https://' + self._host,timeout = 3) res.raise_for_status() self.__cookies = res.cookies result = re.search(r'tkk:'(d+.d+)?'',res.text).group(1) except requests.exceptions.ReadTimeout as ex: print('ERROR: ' + str(ex)) time.sleep(1) return result def __getGoogleToken(self,a,TKK): """Calculate Google tk from TKK """ # https://www.cnblogs.com/chicsky/p/7443830.html # if text = 'Tablet Developer' and TKK = '435102.3120524463',then tk = '315066.159012' def RL(a,b): for d in range(0,3): c = b[d + 2] c = ord(c[0]) - 87 if 'a' <= c else int(c) c = a >> c if '+' == b[d + 1] else a << c a = a + c & 4294967295 if '+' == b[d] else a ^ c return a g = [] f = 0 while f < len(a): c = ord(a[f]) if 128 > c: g.append(c) else: if 2048 > c: g.append((c >> 6) | 192) else: if (55296 == (c & 64512)) and (f + 1 < len(a)) and (56320 == (ord(a[f+1]) & 64512)): f += 1 c = 65536 + ((c & 1023) << 10) + (ord(a[f]) & 1023) g.append((c >> 18) | 240) g.append((c >> 12) & 63 | 128) else: g.append((c >> 12) | 224) 
g.append((c >> 6) & 63 | 128) g.append((c & 63) | 128) f += 1 e = TKK.split('.') h = int(e[0]) or 0 t = h for item in g: t += item t = RL(t,'+-a^+6') t = RL(t,'+-3^+b+-f') t ^= int(e[1]) or 0 if 0 > t: t = (t & 2147483647) + 2147483648 result = t % 1000000 return str(result) + '.' + str(result ^ h) def translate(self,text): if time.time() > self.__googleTokenKeyRetireTime: self.__updateGoogleTokenKey() data = {'q': text} self._params['tk'] = self.__getGoogleToken(text,self.__googleTokenKey) result = '' try: res = requests.post(self._url,headers = self._headers,cookies = self.__cookies,data = data,params = self._params,timeout = 6) res.raise_for_status() jsonText = res.text if len(jsonText)>0: jsonResult = json.loads(jsonText) if len(jsonResult[0])>0: for item in jsonResult[0]: result += item[0] return result except Exception as ex: print('ERROR: ' + str(ex)) return '' import time from GoogleTranslator import GoogleTranslator def readFile(fileName): with open(fileName,'r') as f: paragraph = '' for line in f: if line[0]!='\n': paragraph += line.strip('\n') else: if len(paragraph)>0: yield paragraph paragraph = '' if len(paragraph)>0: yield paragraph main.py: def main(): translator = GoogleTranslator() count = 0 with open('C:\\dx\\python\\d.txt','w',encoding='utf-8') as df: for line in readFile('C:\\dx\\python\\s.txt'): if len(line) > 1: count += 1 print('\r' + str(count),end = '',flush = True) df.write(line.strip() + "\n") result = translator.translate(line) df.write(result.strip() + "\n\n") if __name__ == "__main__": startTime = time.time() main() print() print('%.2f seconds' % (time.time() - startTime)) 结束语:求人不如求己。不能怕烦,代码都是人敲出来的,找不到现成的还得靠自己编。 (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与站长联系删除相关内容!
- 为什么python双引号在文件名中转换为连字符?
- python – 如何使用SQLAlchemy映射一个类与多个表?
- python – sqlalchemy以一对多的关系添加孩子
- Python中base64加密、解密功能实例
- django auto slug在模型中形成如django admin中的预填充字段
- python – 从scipy.stats … rvs和numpy.random的随机抽取之
- python – 由pandas dataframe中的另一个列内容填充NaN列
- “python setup.py install”不在virtualenv中安装
- Python|教你用 Python 来朗读网页!
- 基于TCP与UDP协议的socket通信