Python Crawler 1.07 - MaoyanSpider - Persisting Data to MySQL
2021/10/7 19:40:56
This article covers part 1.07 of the Python crawler series: persisting the data scraped by MaoyanSpider to MySQL. It should be a useful reference for anyone solving the same problem, so follow along below.
docker-compose YAML file for MySQL
```yaml
version: '3'
services:
  mysql:
    image: 'mysql/mysql-server:5.7'
    restart: always
    container_name: mysql
    environment:
      MYSQL_ROOT_PASSWORD: 123456
    command: --default-authentication-plugin=mysql_native_password --character-set-server=utf8mb4 --collation-server=utf8mb4_general_ci --explicit_defaults_for_timestamp=true --lower_case_table_names=1 --max_allowed_packet=128M
    ports:
      - 3306:3306
```
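Starting the container with `docker-compose up -d` is enough, but the database and table that the examples below write to must exist first. Here is a minimal setup sketch; the column names and types are assumptions inferred from the insert statements used later, not taken from the original schema:

```python
import pymysql

# Create maoyandb and the maoyantab table the spider inserts into.
# Column names/types below are illustrative assumptions.
db = pymysql.connect(host='10.0.0.6', user='root', password='123456', charset='utf8')
cursor = db.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS maoyandb CHARACTER SET utf8mb4')
cursor.execute('USE maoyandb')
cursor.execute(
    'CREATE TABLE IF NOT EXISTS maoyantab ('
    'name VARCHAR(200),'   # movie title
    'star VARCHAR(300),'   # starring actors
    'time VARCHAR(100)'    # release time
    ')'
)
db.commit()
cursor.close()
db.close()
```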
Using the pymysql module
""" p8 pymysql模块与MySQL数据库交互流程 创建数据库连接对象 db=pymysql.connect(xx,xx,xx,xx,xx) 创建游标对象 cursor=db.cursor() 执行SQL命令 cursor.execute(sql语句,[xx,xx]) 提交到数据库执行 db.commit() 关闭游标 cursor.close() 断开数据库连接 db.close() """ import pymysql # #1、创建数据库连接对象 + 游标对象 # db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8') # cursor = db.cursor() # #2、利用游标对象的execute()方法执行SQL命令 # ins = 'insert into maoyantab values(%s,%s,%s)' # cursor.execute(ins, ['大话西游之月光宝盒', '周星驰', '1993-01-01']) # #3、提交到数据库执行:commit() # db.commit() # #4、关闭游标 + 断开数据库连接 # cursor.close() # db.close() """ pymysql之excutemany()方法 作用: ---减少数据库IO次数,提高效率,一次性插入多条表记录 示例: ins = 'insert into maoyantab values(%s,%s,%s)' film_li = [ ('大话西游之月光宝盒', '周星驰', '1993-01-01'), '大话西游之月光宝盒', '周星驰', '1994-01-01' ] cursor.executemany(ins,film_li) """ #利用excutemany()方法,一次性插入多条表记录,提升数据库的效率 #1、创建数据库连接对象 + 游标对象 db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8') cursor = db.cursor() #2、利用游标对象的execute()方法执行SQL命令 ins = 'insert into maoyantab values(%s,%s,%s)' film_li = [ ('大话西游之月光宝盒', '周星驰', '1993'), ('大话西游之月光宝盒', '周星驰', '1994'), ('大话西游之大圣娶亲', '周星驰', '1995'), ('大话西游之大圣娶亲', '周星驰', '1996') ] cursor.executemany(ins, film_li) #3、提交到数据库执行:commit() db.commit() #4、关闭游标 + 断开数据库连接 cursor.close() db.close()
Storing each scraped record individually with pymysql's execute() method
""" 猫眼电影TOP100抓取 - execute() """ from urllib import request import random import time import re import pymysql class MaoyanSpider: def __init__(self): self.url = 'https://maoyan.com/board/4?offset={}' self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36', 'Cookie': '__mta=214675752.1627866406865.1627875459110.1627875460018.12; uuid_n_v=v1; uuid=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; _csrf=38f9740349f3f3b55a88970a5164681765e4611ccbd2fc8ef5f526914970614d; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1627866407; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=17b04661fb4c8-0813968a22224b-d7e1938-e1000-17b04661fb4c8; _lxsdk=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; __mta=214675752.1627866406865.1627866406865.1627866409991.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1627875460; _lxsdk_s=17b04f00a6e-9d7-717-d8%7C%7C9' } # 添加计数变量 self.i = 0 # 连接数据库 + 游标对象 self.db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8') self.cursor = self.db.cursor() def get_html(self, url): """获取HTML内容""" req = request.Request(url=url, headers=self.headers) res = request.urlopen(req) html = res.read().decode() # print(html) # 直接调用解析函数 self.parse_html(html) def parse_html(self, html): """提取HTML内容""" regex = '<div class="movie-item-info">.*?title="(.*?)".*?">.*?</a></p>.*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>' pattern = re.compile(regex, re.S) r_list = pattern.findall(html) # 调用数据处理函数 self.save_html(r_list) def save_html(self, r_list): """数据处理函数""" ins = 'insert into maoyantab values(%s,%s,%s)' for r in r_list: film_li = [ r[0].strip(), r[1].strip(), r[2].strip() ] self.cursor.execute(ins, film_li) self.db.commit() print(film_li) self.i += 1 def run(self): """程序运行调配""" for page in range(0, 91, 10): url = self.url.format(page) # print(url) self.get_html(url) # 控制数据抓取频率 time.sleep(random.randint(1, 2)) # 所有页面数据抓取完成后关闭 self.cursor.close() self.db.close() if __name__ == '__main__': spider = MaoyanSpider() spider.run() print('电影数量:', spider.i)
Storing all scraped data in one batch with pymysql's executemany() method
""" 猫眼电影TOP100抓取 - executemany() """ from urllib import request import random import time import re import pymysql class MaoyanSpider: def __init__(self): self.url = 'https://maoyan.com/board/4?offset={}' self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36', 'Cookie': '__mta=214675752.1627866406865.1627875459110.1627875460018.12; uuid_n_v=v1; uuid=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; _csrf=38f9740349f3f3b55a88970a5164681765e4611ccbd2fc8ef5f526914970614d; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1627866407; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=17b04661fb4c8-0813968a22224b-d7e1938-e1000-17b04661fb4c8; _lxsdk=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; __mta=214675752.1627866406865.1627866406865.1627866409991.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1627875460; _lxsdk_s=17b04f00a6e-9d7-717-d8%7C%7C9' } # 添加计数变量 self.i = 0 # 连接数据库 + 游标对象 + 存放所有电影信息的大列表 self.db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8') self.cursor = self.db.cursor() self.all_film_list = [] def get_html(self, url): """获取HTML内容""" req = request.Request(url=url, headers=self.headers) res = request.urlopen(req) html = res.read().decode() # print(html) # 直接调用解析函数 self.parse_html(html) def parse_html(self, html): """提取HTML内容""" regex = '<div class="movie-item-info">.*?title="(.*?)".*?">.*?</a></p>.*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>' pattern = re.compile(regex, re.S) r_list = pattern.findall(html) # 调用数据处理函数 # print(r_list) self.save_html(r_list) def save_html(self, r_list): """数据处理函数""" for r in r_list: film_t = ( r[0].strip(), r[1].strip(), r[2].strip() ) # 把所有电影的元组添加到大列表中,用于最后的 executemany() self.all_film_list.append(film_t) print(film_t) self.i += 1 def run(self): """程序运行调配""" for page in range(0, 91, 10): url = self.url.format(page) # print(url) self.get_html(url) # 控制数据抓取频率 time.sleep(random.randint(1, 2)) # 所有页面数据抓取完成,一次性存入到MySQL数据库 ins = 'insert into maoyantab values(%s,%s,%s)' self.cursor.executemany(ins, self.all_film_list) self.db.commit() self.cursor.close() self.db.close() if __name__ == '__main__': spider = MaoyanSpider() spider.run() print('电影数量:', spider.i)
That wraps up this article on Python Crawler 1.07 - MaoyanSpider - Persisting Data to MySQL. I hope it proves helpful, and that you will keep supporting 为之网!