Python + Selenium: Scraping Bilibili User Info (IP Pool + pymysql Storage)
2021/7/19 2:05:09
This article shows how to scrape Bilibili user information with Python and Selenium, using a proxy IP pool and storing the results with pymysql. It should be a useful reference for anyone tackling a similar problem.
import json
import time

import pymysql
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def getPage(mid, n, href):
    """Fetch one page (50 entries) of an UP's follower list via the API."""
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Cookie': "",
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Referer': href + '/fans/fans',
    }
    params = (
        ('vmid', str(mid)),
        ('pn', str(n)),
        ('ps', '50'),
        ('order', 'desc'),
    )
    # Static proxy pool; these entries are examples and will likely be stale.
    proxy = ["116.117.134.134", "112.80.248.73", "47.99.209.194", "1.181.48.68",
             "60.255.151.81", "202.108.22.5", "223.104.38.117"]
    # Rotate through the pool until a proxy returns HTTP 200.
    for i, ip in enumerate(proxy):
        print(i)
        proxies = {'https': 'http://' + ip}
        response = requests.get('https://api.bilibili.com/x/relation/followers',
                                proxies=proxies, headers=headers, params=params)
        if response.status_code == 200:
            return response
    print("All proxy IPs failed")
    return None


def getUserDetails(mid):
    """Fetch a user's profile from the space API."""
    # buvid3 is the cookie Bilibili sets on first visit; value copied from a browser session.
    cookies = {'buvid3': '7A29BBDE-VA94D-4F66-QC63-D9CB8568D84331045infoc'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Origin': 'https://space.bilibili.com',
        'Connection': 'keep-alive',
        'Referer': 'https://space.bilibili.com/546195/fans/fans',
        'Cache-Control': 'max-age=0',
    }
    params = (
        ('mid', str(mid)),
        ('jsonp', 'jsonp'),
    )
    proxy = ["112.95.18.193", "112.80.248.73", "47.99.209.194", "1.181.48.68",
             "60.255.151.81", "202.108.22.5", "223.104.38.117"]
    for i, ip in enumerate(proxy):
        print(i)
        proxies = {'https': 'http://' + ip}
        response = requests.get('https://api.bilibili.com/x/space/acc/info',
                                proxies=proxies, headers=headers,
                                cookies=cookies, params=params)
        if response.status_code == 200:
            return response
    print("All proxy IPs failed")
    return None


def getUpInfoBySelenium(href, mid):
    """Render the UP's space page in headless Chrome and scrape follow/fan counts."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(
        executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        options=chrome_options)
    browser.get(href)
    try:
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        focus = soup.find('p', 'n-data-v space-attention').text  # following count
        fans = soup.find('p', 'n-data-v space-fans').text        # follower count
        print("following: " + str(focus), "followers: " + str(fans))
    finally:
        browser.close()


def viplevel(vip):
    """Map Bilibili's vipType code to a readable label."""
    if vip == 0:
        vipname = 'non-member'
    elif vip == 1:
        vipname = 'member'
    else:
        vipname = 'premium member'
    return vipname


def createDb():
    # Create the database (IF NOT EXISTS makes reruns safe).
    db = pymysql.connect(host='localhost', user='root', password='admin', port=3306)
    cursor = db.cursor()
    cursor.execute('CREATE DATABASE IF NOT EXISTS bilibili')
    db.close()
    # Table for UP profiles. `view` is a reserved word in MySQL, so it is backquoted.
    db = pymysql.connect(host='localhost', user='root', password='admin',
                         port=3306, db='bilibili')
    cursor = db.cursor()
    sql = 'CREATE TABLE IF NOT EXISTS up (id INT(11) NOT NULL AUTO_INCREMENT, ' \
          'up_id VARCHAR(255) NOT NULL, up_name VARCHAR(255) NOT NULL, ' \
          'sex VARCHAR(10) NOT NULL, birthday VARCHAR(255), ' \
          'focus VARCHAR(255), fans VARCHAR(255), area VARCHAR(255), ' \
          'praise VARCHAR(255), `view` VARCHAR(255), ' \
          'sign VARCHAR(255) NOT NULL, title VARCHAR(255) NOT NULL, ' \
          'PRIMARY KEY (id, up_id))'
    cursor.execute(sql)
    db.close()
    # Table for followers.
    db = pymysql.connect(host='localhost', user='root', password='admin',
                         port=3306, db='bilibili')
    cursor = db.cursor()
    sql = 'CREATE TABLE IF NOT EXISTS fans (id INT(11) NOT NULL AUTO_INCREMENT, ' \
          'up_id VARCHAR(255) NOT NULL, fans_id VARCHAR(255) NOT NULL, ' \
          'fans_name VARCHAR(255) NOT NULL, sex VARCHAR(10) NOT NULL, ' \
          'fans_level VARCHAR(10) NOT NULL, viplevel VARCHAR(255) NOT NULL, ' \
          'time VARCHAR(255) NOT NULL, ' \
          'PRIMARY KEY (id))'
    cursor.execute(sql)
    db.close()


def insertUp(mid, name, sex, sign, birthday, title):
    db = pymysql.connect(host='localhost', user='root', password='admin',
                         port=3306, db='bilibili')
    cursor = db.cursor()
    sql = 'INSERT INTO up(up_id,up_name,sex,sign,birthday,title) values(%s,%s,%s,%s,%s,%s)'
    val = (mid, name, sex, sign, birthday, title)
    try:
        cursor.execute(sql, val)
        db.commit()
    except:
        db.rollback()
    db.close()


def insertFans(up_mid, fans_mid, time, uname, viplevel, sex, level):
    db = pymysql.connect(host='localhost', user='root', password='admin',
                         port=3306, db='bilibili')
    cursor = db.cursor()
    sql = 'INSERT INTO fans(up_id,fans_id,fans_name,sex,fans_level,viplevel,time) ' \
          'values(%s,%s,%s,%s,%s,%s,%s)'
    val = (up_mid, fans_mid, uname, sex, level, viplevel, time)
    try:
        cursor.execute(sql, val)
        db.commit()
    except:
        db.rollback()
    db.close()


if __name__ == '__main__':
    up_id = ["546195", "9824766", "777536", "321173469", "517327498",
             "122879", "20165629", "14110780", "62540916", "19577966"]
    for i in range(len(up_id)):
        href = "https://space.bilibili.com/" + str(up_id[i]) + "/video"
        up = getUserDetails(up_id[i])  # fetch the UP's profile (JSON)
        json_obj = json.loads(up.text)
        up_mid = json_obj['data']['mid']
        name = json_obj['data']['name']
        sex = json_obj['data']['sex']
        sign = json_obj['data']['sign']
        level = json_obj['data']['level']
        birthday = json_obj['data']['birthday']
        title = json_obj['data']['official']['title']
        print("UP uid: " + str(up_mid), "name: " + name, "sex: " + sex,
              "bio: " + sign, "birthday: " + str(birthday), "title: " + title)
        # ------------------------------------------------ #
        print("starting selenium")
        getUpInfoBySelenium(href, str(up_mid))  # prints follow/fan counts
        print("selenium done")
        # ------------------------------------------------ #
        print("follower data:", end='')
        for j in range(1, 5):
            print("j: " + str(j))  # str() needed; the original "j:" + j raises TypeError
            r = getPage(up_id[i], j, href)
            page_obj = json.loads(r.text)  # follower-list page as JSON
            for entry in page_obj['data']['list']:
                fans_mid = entry['mid']
                mtime = entry['mtime']
                uname = entry['uname']
                vip = entry['vip']['vipType']
                fansDetails = getUserDetails(fans_mid)
                fan_obj = json.loads(fansDetails.text)
                sex = fan_obj['data']['sex']
                level = fan_obj['data']['level']
                print("uid: " + str(fans_mid),
                      "followed at: " + time.strftime("%Y-%m-%d %H:%M:%S",
                                                      time.localtime(mtime)),
                      "name: " + uname,
                      "VIP: " + viplevel(vip),
                      "sex: " + sex,
                      "level: " + str(level))
                time.sleep(5)  # throttle to avoid an IP ban
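One caveat about persistence: createDb, insertUp, and insertFans are defined above but never invoked in __main__, so as written the script only prints. A minimal sketch of how they might be wired in, reusing the variable names from the main loop above (this wiring is my addition, not part of the original script):

# Hypothetical wiring (not in the original script): create the schema once,
# then persist each record as it is scraped, reusing the loop variables above.
createDb()  # run once; CREATE ... IF NOT EXISTS makes reruns safe
insertUp(str(up_mid), name, sex, sign, str(birthday), title)
insertFans(str(up_mid), str(fans_mid),
           time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mtime)),
           uname, viplevel(vip), sex, str(level))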
The script starts from a hard-coded list of UP (uploader) IDs. These accounts were chosen for their large follower counts, which makes follower information easy to collect.
The script builds each UP's space URL, scrapes the follow and fan counts, then walks the follower list and stores the results in a local MySQL database through pymysql (the SQL for creating the schema is embedded in the code). Bilibili throttles high-frequency access by IP, so requests are routed through a pool of proxy IPs. The pool is static rather than continuously refreshed: with so few working proxies available, building a live, self-refreshing pool was not worthwhile, so the script simply rotates through a fixed list. A sketch of what a refreshing pool could look like follows.
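As a rough illustration of the refreshing-pool idea the article stops short of, here is a minimal sketch; refresh_pool, the test URL, and the timeout are my assumptions, and real proxies would need host:port entries:

import requests

def refresh_pool(candidates, test_url="https://api.bilibili.com", timeout=5):
    """Return only the candidate proxies that can still complete a request.
    `candidates` is assumed to be a list of 'host:port' strings."""
    alive = []
    for ip in candidates:
        proxies = {"https": "http://" + ip}
        try:
            r = requests.get(test_url, proxies=proxies, timeout=timeout)
            if r.status_code == 200:
                alive.append(ip)
        except requests.RequestException:
            pass  # dead or unreachable proxy, drop it
    return alive

Calling this periodically (say, every few hundred requests) and swapping the result in for the hard-coded proxy list would keep the pool fresh without changing the rest of the script.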
Also, Bilibili does not let you page arbitrarily deep into an UP's follower list. Whether this is a platform-side limit on user data or a gap in my own technique, I could not get around it, so only a few dozen pages of followers can be crawled per UP.
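One way to cope gracefully is to stop paging as soon as the follower API runs dry instead of hard-coding a page count. Here is a minimal sketch built on getPage above; treating a non-zero code or an empty data.list as the end of the data is an assumption about Bilibili's response envelope:

import json

def crawl_all_pages(mid, href, max_pages=100):
    """Page through a follower list until the API runs dry or refuses."""
    followers = []
    for n in range(1, max_pages + 1):
        r = getPage(mid, n, href)
        if r is None:
            break  # all proxies failed
        obj = json.loads(r.text)
        # Assumption: non-zero `code` or an empty `list` means no more pages.
        if obj.get('code', 0) != 0 or not obj.get('data', {}).get('list'):
            break
        followers.extend(obj['data']['list'])
    return followers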