Python Crawlers with Multithreading, Thread Pools, and Coroutines
2021/9/11 12:34:47
This article walks through three ways to write a web crawler in Python: a multithreaded producer-consumer version, a thread-pool version, and a coroutine version, each with a complete runnable example.
Multithreaded producer-consumer crawler
import queue
import requests
from bs4 import BeautifulSoup
import threading
import time
import random


def craw(url):
    # Fetch one page and return its HTML
    r = requests.get(url=url)
    return r.text


def parse(html):
    # Extract (href, title) pairs from the cnblogs post-title links
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all("a", class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]


def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    # Producer: take a URL, fetch it, hand the HTML to the parsers
    while True:
        url = url_queue.get()
        html = craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, url)
        time.sleep(random.randint(1, 2))


def do_parse(html_queue: queue.Queue, f_out):
    # Consumer: take HTML, parse it, write the results to a file
    while True:
        html = html_queue.get()
        results = parse(html)
        for result in results:
            f_out.write(str(result) + "\n")
        print(threading.current_thread().name, html_queue.qsize())
        time.sleep(1)


if __name__ == '__main__':
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]:
        url_queue.put(url)
    # 3 fetcher threads feed 2 parser threads through html_queue
    for idx in range(3):
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue),
                             name=f"craw-{idx}")
        t.start()
    file = open("02.data.txt", "w")
    for idx in range(2):
        d = threading.Thread(target=do_parse, args=(html_queue, file),
                             name=f"parse-{idx}")
        d.start()
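Both worker loops above run forever, so the script never exits on its own and 02.data.txt is never closed cleanly. A common refinement is to push one sentinel per consumer once the work is done. The sketch below shows the idea on a toy queue; the None sentinel, the NUM_CONSUMERS name, and the sample items are illustrative assumptions, not part of the original code.

import queue
import threading

NUM_CONSUMERS = 2  # assumption: mirrors the two parse threads above


def consumer(q: queue.Queue, f_out):
    while True:
        item = q.get()
        if item is None:  # sentinel: no more work, this worker can stop
            break
        f_out.write(str(item) + "\n")


if __name__ == "__main__":
    q = queue.Queue()
    with open("02.data.txt", "w") as f_out:
        workers = [threading.Thread(target=consumer, args=(q, f_out))
                   for _ in range(NUM_CONSUMERS)]
        for w in workers:
            w.start()
        for item in ["a", "b", "c"]:
            q.put(item)
        for _ in range(NUM_CONSUMERS):
            q.put(None)  # one sentinel per consumer
        for w in workers:
            w.join()  # the file closes only after all writers have finished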
Thread-pool crawler
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup

spider_url = ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]


def craw(url):
    r = requests.get(url=url)
    return r.text


def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all("a", class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]


# craw: map() runs craw over the pool and keeps the input order
with ThreadPoolExecutor() as pool:
    htmls = pool.map(craw, spider_url)
    htmls = list(zip(spider_url, htmls))
    for k, v in htmls:
        print(k, len(v))

# parse: submit() returns a Future; as_completed() yields each one as it finishes
with ThreadPoolExecutor() as pool:
    futures = {}
    for url, html in htmls:
        future = pool.submit(parse, html)
        futures[future] = url

    # Alternative: iterate futures.items() and call result() in submission order
    # for k, v in futures.items():
    #     print(v, k.result())

    for future in as_completed(futures):
        print(futures[future], future.result())
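Note the two collection styles: pool.map returns results in the same order as spider_url, while as_completed yields each future as soon as it finishes. Neither shields you from a failed request, because future.result() re-raises any exception the worker hit. A minimal error-handling sketch; the fetch helper, URLs, worker count, and timeout are illustrative assumptions, not from the original.

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

# The second URL is deliberately unresolvable to trigger the error path
urls = ["https://www.cnblogs.com/", "https://example.invalid/"]


def fetch(url):
    r = requests.get(url, timeout=10)
    r.raise_for_status()  # turn HTTP error statuses into exceptions
    return len(r.text)


with ThreadPoolExecutor(max_workers=4) as pool:
    futures = {pool.submit(fetch, u): u for u in urls}
    for future in as_completed(futures):
        url = futures[future]
        try:
            print(url, future.result())  # result() re-raises any exception from fetch()
        except requests.RequestException as exc:
            print(url, "failed:", exc)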
Coroutine crawler
import asyncio
import aiohttp

spider_url = ["https://www.cnblogs.com/taozhengquan/p/14966535.html"] * 50

# Semaphore caps the number of requests in flight at 10
semaphore = asyncio.Semaphore(10)


async def async_craw(url):
    async with semaphore:
        print("craw url:", url)
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                result = await resp.text()
                print(url, len(result))


loop = asyncio.get_event_loop()
tasks = [loop.create_task(async_craw(item)) for item in spider_url]
loop.run_until_complete(asyncio.wait(tasks))
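The get_event_loop / run_until_complete pattern above matches the older asyncio API; on Python 3.10+, creating the Semaphore at module level can bind it to a different event loop than the one that runs the tasks. A sketch of the same crawler in the newer asyncio.run style; the main wrapper and the single shared ClientSession are my assumptions, while the URL list and the concurrency limit of 10 are kept from the original.

import asyncio
import aiohttp

spider_url = ["https://www.cnblogs.com/taozhengquan/p/14966535.html"] * 50


async def async_craw(url, session, semaphore):
    async with semaphore:  # cap concurrent requests
        async with session.get(url) as resp:
            result = await resp.text()
            print(url, len(result))


async def main():
    semaphore = asyncio.Semaphore(10)  # created inside the running loop
    # One session reused by all tasks instead of one per request
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(async_craw(u, session, semaphore)
                               for u in spider_url))


asyncio.run(main())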
That wraps up this introduction to Python crawlers built with multithreading, thread pools, and coroutines. We hope the examples are helpful, and thank you for supporting 为之网!