【Python爬虫】之百度翻译sign解密
2021/7/27 1:06:06
本文主要是介绍【Python爬虫】之百度翻译sign解密,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!
目录
Fiddler配合谷歌浏览器抓取数据
谷歌浏览器调试
Python代码
Fiddler配合谷歌浏览器抓取数据
首先发现百度翻译是先对输入的词进行查询语言类别,通过这个地址https://fanyi.baidu.com/langdetect
其次还发现了对提交的数据进行了url编码,输入的“我是超人”,查询返回的是"lan":"zh",到目前为止还没有发现有加密的地方,咱们继续往下看真正发送请求的链接
谷歌浏览器调试
https://fanyi.baidu.com/v2transapi?from=zh&to=en 这个链接才是真正发送请求的链接。一眼就看到了一个sign参数,不用想这肯定是加密了(因为我搜过这个值,没有 = =!),其次token这个参数其实是这个服务器的返回值,从历史返回里就可以找到了。主要解决了这个sign就能成,在谷歌浏览器里切换到【Sources】,然后按Ctrl + Shift + F 搜sign
发现居然有56个,太多了,我们加个冒号试试,注意是英文冒号。
这次只有14个,很好,我们全部下断点(点进去,搜sign: 然后在行数的前面点一下),然后重新输入要翻译的词,看看会不会断下来
发现成功断下来了,是把L(e) 的值赋给了sign,然后我们在控制台打印下L(e)的值确定是我们想要的值。 而e的值就是“你好”。我们跟进去看下这个函数(按F11进入函数)
跟进去看了下,发现没有什么特别的加密,就只有这一段JS而已,JS代码如下
// Stand-in for the browser global so the snippet also runs outside a page
// (for example under Node or PyExecJS). Declared with `var` so it is not an
// implicit global, which would throw in strict mode.
var window = {};
// Cache for the seed string that e() would otherwise read from window.gtk.
var i = null;

/**
 * Mix the 32-bit value r according to the operation string o.
 *
 * o is consumed three characters at a time:
 *   o[t]     "+" adds the shifted value (wrapped to 32 bits); anything else XORs it
 *   o[t + 1] "+" selects an unsigned right shift; anything else a left shift
 *   o[t + 2] the shift amount: "0"-"9" as is, "a" = 10, "b" = 11, ...
 *
 * @param {number} r value to mix
 * @param {string} o operation string, e.g. "+-a^+6"
 * @returns {number} the mixed value (a signed 32-bit integer)
 */
function n(r, o) {
    for (var t = 0; t < o.length - 2; t += 3) {
        var a = o.charAt(t + 2);
        a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a);
        a = "+" === o.charAt(t + 1) ? r >>> a : r << a;
        r = "+" === o.charAt(t) ? (r + a) & 4294967295 : r ^ a;
    }
    return r;
}

/**
 * Compute the `sign` request parameter for the query text r.
 *
 * Texts longer than 30 characters are reduced to their first 10, middle 10
 * and last 10 characters (a surrogate pair counts as one character), the
 * result is UTF-8 encoded, and the bytes are folded into a hash seeded by
 * the two halves of the "gtk" string.
 *
 * @param {string} r text to be translated
 * @returns {string} "<p>.<p ^ m>", where p is the hash modulo 1e6 and m is
 *                   the first half of the seed
 */
function e(r) {
    var pairs = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === pairs) {
        // Only BMP characters: string indices are character indices.
        var len = r.length;
        if (len > 30) {
            r = "" + r.substr(0, 10) + r.substr(Math.floor(len / 2) - 5, 10) + r.substr(-10, 10);
        }
    } else {
        // Rebuild the text as a list of whole characters so that truncation
        // never cuts a surrogate pair in half.
        var parts = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/);
        var chars = [];
        for (var k = 0, h = parts.length; k < h; k++) {
            if ("" !== parts[k]) {
                // The scraped original wrapped this array in a helper `a(...)`
                // that is not defined in the snippet, so any text containing
                // a surrogate pair threw a ReferenceError.
                chars.push.apply(chars, parts[k].split(""));
            }
            if (k !== h - 1) {
                chars.push(pairs[k]);
            }
        }
        var g = chars.length;
        if (g > 30) {
            r = chars.slice(0, 10).join("") +
                chars.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") +
                chars.slice(-10).join("");
        }
    }

    var u = void 0;
    var l = "gtk";
    // The page reads the seed from window.gtk (cached in i) ...
    u = null !== i ? i : (i = window[l] || "") || "";
    // ... but outside the browser that global does not exist, so the value
    // observed while debugging is pinned here.
    u = '320305.131321201';

    var d = u.split(".");
    var m = Number(d[0]) || 0;
    var s = Number(d[1]) || 0;

    // Encode r as UTF-8 bytes into S.
    var S = [];
    var c = 0;
    for (var v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        if (A < 128) {
            S[c++] = A;
        } else {
            if (A < 2048) {
                S[c++] = A >> 6 | 192;
            } else {
                if (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1))) {
                    // High + low surrogate: combine into one code point.
                    A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v));
                    S[c++] = A >> 18 | 240;
                    S[c++] = A >> 12 & 63 | 128;
                } else {
                    S[c++] = A >> 12 | 224;
                }
                S[c++] = A >> 6 & 63 | 128;
            }
            S[c++] = 63 & A | 128;
        }
    }

    // Fold every byte into the running hash, then apply the final mix.
    var p = m;
    var F = "+-a^+6";
    var D = "+-3^+b+-f";
    for (var b = 0; b < S.length; b++) {
        p += S[b];
        p = n(p, F);
    }
    p = n(p, D);
    p ^= s;
    if (p < 0) {
        // Reinterpret the signed 32-bit value as unsigned.
        p = (2147483647 & p) + 2147483648;
    }
    p %= 1e6;
    return p.toString() + "." + (p ^ m);
}
有一点需要注意的是,js里是根据 i 的值(即 window.gtk)来取 u 的值,在调试的时候发现 i 的值是固定的 320305.131321201,所以我在js里直接写死 u = '320305.131321201',在python里执行下就成功了
Python代码
"""Command-line client for the Baidu Translate web endpoint.

The script detects the language of the input text, computes the ``sign``
parameter by running the site's JavaScript through PyExecJS, and posts the
translation request.
"""
import functools
import json
import time
from urllib.parse import urlencode

import execjs
import requests

# Seconds to wait for a response; requests waits indefinitely by default.
REQUEST_TIMEOUT = 10

# Token taken from an earlier response of the translate page.
# NOTE(review): presumably it has to match the BAIDUID cookie hard-coded in
# getCookies(); run getToken() and read a fresh value if requests are rejected.
BAIDU_TOKEN = '97f41ef953422689ecd99065d10c7775'

# JavaScript that computes the `sign` parameter.  This is a raw string so
# that the regex escapes (\uD800 ...) reach the JS engine exactly as written
# instead of being turned into lone surrogate characters by Python.
_SIGN_JS = r"""
var window = {};
var i = null;

function n(r, o) {
    for (var t = 0; t < o.length - 2; t += 3) {
        var a = o.charAt(t + 2);
        a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a);
        a = "+" === o.charAt(t + 1) ? r >>> a : r << a;
        r = "+" === o.charAt(t) ? (r + a) & 4294967295 : r ^ a;
    }
    return r;
}

function e(r) {
    var pairs = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === pairs) {
        var len = r.length;
        if (len > 30) {
            r = "" + r.substr(0, 10) + r.substr(Math.floor(len / 2) - 5, 10) + r.substr(-10, 10);
        }
    } else {
        var parts = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/);
        var chars = [];
        for (var k = 0, h = parts.length; k < h; k++) {
            if ("" !== parts[k]) {
                chars.push.apply(chars, parts[k].split(""));
            }
            if (k !== h - 1) {
                chars.push(pairs[k]);
            }
        }
        var g = chars.length;
        if (g > 30) {
            r = chars.slice(0, 10).join("") +
                chars.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") +
                chars.slice(-10).join("");
        }
    }

    var u = void 0;
    var l = "gtk";
    u = null !== i ? i : (i = window[l] || "") || "";
    u = '320305.131321201';

    var d = u.split(".");
    var m = Number(d[0]) || 0;
    var s = Number(d[1]) || 0;

    var S = [];
    var c = 0;
    for (var v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        if (A < 128) {
            S[c++] = A;
        } else {
            if (A < 2048) {
                S[c++] = A >> 6 | 192;
            } else {
                if (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1))) {
                    A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v));
                    S[c++] = A >> 18 | 240;
                    S[c++] = A >> 12 & 63 | 128;
                } else {
                    S[c++] = A >> 12 | 224;
                }
                S[c++] = A >> 6 & 63 | 128;
            }
            S[c++] = 63 & A | 128;
        }
    }

    var p = m;
    var F = "+-a^+6";
    var D = "+-3^+b+-f";
    for (var b = 0; b < S.length; b++) {
        p += S[b];
        p = n(p, F);
    }
    p = n(p, D);
    p ^= s;
    if (p < 0) {
        p = (2147483647 & p) + 2147483648;
    }
    p %= 1e6;
    return p.toString() + "." + (p ^ m);
}
"""


def getHeaders(cookies):
    """Return the request headers, with *cookies* as the ``Cookie`` value."""
    return {
        'Host': 'fanyi.baidu.com',
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        'Cookie': cookies,
    }


def getCookies(t):
    """Build the cookie string; *t* (a str) fills both Hm_lvt and Hm_lpvt."""
    return (
        'BAIDUID=2798F941BEE3BAD44CC9E6225279FF4A:FG=1; '
        'BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; '
        f'Hm_lvt_64ecd82404c51e03dc91cb9e8c025574={t}; '
        f'Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574={t};'
    )


@functools.lru_cache(maxsize=1)
def _sign_context():
    """Compile the sign JavaScript once and reuse the context afterwards."""
    return execjs.compile(_SIGN_JS)


def getSign(wd):
    """Return the ``sign`` parameter for the text *wd*.

    The value is computed by the JavaScript function ``e`` via PyExecJS, so a
    JavaScript runtime must be available to ``execjs``.
    """
    return _sign_context().call("e", wd)


def getLan(wd, headers):
    """Ask the langdetect endpoint for the language code of *wd*.

    Returns the ``lan`` field of the response (``"zh"`` for Chinese input),
    or ``''`` when detection fails; in that case the response body is
    printed.  Network errors and timeouts from ``requests`` propagate.
    """
    url = "https://fanyi.baidu.com/langdetect"
    # The form is URL-encoded by hand to match what the browser sends.
    payload = urlencode({'query': wd}).encode("utf-8")
    resp = requests.post(url, data=payload, headers=headers,
                         timeout=REQUEST_TIMEOUT)
    if resp.status_code == 200:
        try:
            data_json = json.loads(resp.text)
        except ValueError:
            data_json = None
        if isinstance(data_json, dict) and data_json.get('msg') == "success":
            return data_json.get('lan', '')
    print(resp.text)
    return ''


def fanyi(wd, lan, sign, token, headers):
    """Post the translation request for *wd*.

    English is translated to Chinese; every other source language is
    translated to English.

    Returns the response body as UTF-8 ``bytes`` when the status code is 200,
    otherwise the empty string ``""``, so callers must check for a falsy
    result before decoding.  Network errors and timeouts from ``requests``
    propagate.
    """
    language = "zh" if lan == 'en' else "en"
    url = "https://fanyi.baidu.com/v2transapi?from=" + lan + "&to=" + language
    data = {
        'from': lan,
        'to': language,
        'query': wd,
        'transtype': 'realtime',
        'simple_means_flag': '3',
        'sign': sign,
        'token': token,
        'domain': 'common',
    }
    resp = requests.post(url=url, data=urlencode(data).encode("utf-8"),
                         headers=headers, timeout=REQUEST_TIMEOUT)
    if resp.status_code != 200:
        return ""
    print("请求成功")
    return resp.text.encode("utf-8")


def getToken(headers):
    """Fetch the translate page and print it.

    The token is not extracted automatically; read it out of the printed
    page by hand.
    """
    url = "https://fanyi.baidu.com/translate?aldtype=16047&query=&keyfrom=baidu&smartresult=dict&lang=auto2zh"
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    print(resp.text)


def main():
    """Read a text from stdin, translate it and print the parsed response."""
    # 10-digit (second resolution) timestamp for the Hm_lvt / Hm_lpvt cookies.
    t = round(time.time())
    headers = getHeaders(getCookies(str(t)))

    wd = input("请输入要翻译的内容:")
    lan = getLan(wd, headers)
    if not lan:
        print("语言检测失败")
        return

    sign = getSign(wd)
    raw = fanyi(wd, lan, sign, BAIDU_TOKEN, headers)
    if not raw:
        # fanyi() returns "" on a non-200 response; decoding that as bytes
        # would raise a TypeError.
        print("翻译请求失败")
        return
    print(json.loads(raw.decode("utf-8")))


if __name__ == "__main__":
    main()
就到这,好了天黑了,该溜了!
这篇关于【Python爬虫】之百度翻译sign解密的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!
- 2024-11-25Python编程基础:变量与类型
- 2024-11-25Python编程基础与实践
- 2024-11-24Python编程基础详解
- 2024-11-21Python编程基础教程
- 2024-11-20Python编程基础与实践
- 2024-11-20Python编程基础与高级应用
- 2024-11-19Python 基础编程教程
- 2024-11-19Python基础入门教程
- 2024-11-17在FastAPI项目中添加一个生产级别的数据库——本地环境搭建指南
- 2024-11-16`PyMuPDF4LLM`:提取PDF数据的神器