使用 Selenium 爬取淘宝商品列表信息并存入 MongoDB
2021/8/21 19:06:17
本文主要介绍如何使用 Selenium 爬取淘宝商品列表信息并存入 MongoDB,对大家解决编程问题具有一定的参考价值,有需要的读者可以跟着小编一起学习!
selenium_taobao_com.py
#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:selenium_taobao_com.py
# Author:LGSP_Harold
"""Scrape Taobao search-result listings with Selenium and store them in MongoDB.

Flow: log in by QR code, run a keyword search, walk the requested number of
result pages, parse each page with pyquery and insert one MongoDB document
per product.
"""
from urllib.parse import quote

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Headless alternative, kept for reference:
# firefox_options = webdriver.FirefoxOptions()
# firefox_options.add_argument('--headless')
# browser = webdriver.Firefox(firefox_options=firefox_options)
browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)
# NOTE(review): credentials are hard-coded in the connection URI; consider
# reading them from the environment instead.
client = pymongo.MongoClient('mongodb://admin:admin@localhost:27017')
db = client.db_taobao_com


def index_page(page, max_retries=3):
    """Load one search-result page and store the products found on it.

    For ``page > 1`` the page number is typed into the pager's jump box and
    submitted. The function then waits until the pager highlights ``page``
    and at least one product item is present before parsing.

    :param page: 1-based number of the result page to scrape
    :param max_retries: extra attempts made after a timeout before giving up
        (the original code retried without any bound)
    :return: ``True`` if the page was scraped, ``False`` if every attempt
        timed out
    """
    for _ in range(max_retries + 1):
        print('正在爬取第', page, '页')
        try:
            if page > 1:
                page_input = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = WebDriverWait(browser, 60, 3).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR,
                         '#mainsrp-pager div.form span.btn.J_Submit')))
                page_input.clear()
                page_input.send_keys(str(page))
                submit.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'),
                str(page)))
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-itemlist .items .item')))
        except TimeoutException:
            # The page did not reach the expected state in time; try again.
            continue

        _scroll_to_next_button()
        get_products()
        return True

    print('第', page, '页多次超时,已跳过')
    return False


def _scroll_to_next_button():
    """Scroll the pager's "next" item into view if it exists.

    Presumably this forces the whole item list to render before the page
    source is read -- TODO confirm. The element is looked up by an exact
    class match, so a missing element is tolerated instead of aborting the
    crawl.
    """
    try:
        # find_element(By.XPATH, ...) works on Selenium 3 and 4;
        # find_element_by_xpath was removed in Selenium 4.
        next_page = browser.find_element(By.XPATH, '//li[@class="item next"]')
    except NoSuchElementException:
        return
    browser.execute_script('arguments[0].scrollIntoView();', next_page)


def get_products():
    """Parse the current page source and save every product item found.

    Each product is printed and then handed to :func:`save_to_mongo`.
    """
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one product document into the ``collection_product`` collection.

    Failures are reported on stdout and swallowed so that a single bad
    insert does not stop the crawl.

    :param result: product dict to store
    """
    try:
        # insert_one raises on failure, so reaching the print means success.
        db.collection_product.insert_one(result)
        print('存储到MongoDB成功')
    except Exception as e:  # best-effort boundary: log and keep crawling
        print('存储到MongoDB失败')
        print(e)


def login():
    """Open the Taobao login page and wait for a QR-code login to complete.

    On timeout the browser is closed and ``False`` is returned, so the
    caller must not use ``browser`` afterwards.

    :return: ``True`` once logged in and the site-nav link has been clicked,
        ``False`` if any wait timed out
    """
    url = 'https://login.taobao.com/member/login.jhtml'
    browser.get(url=url)

    # Taobao's anti-bot checks detect Selenium, so username/password login
    # fails at the captcha step. Workaround: scan the QR code with the
    # Taobao app. The password flow is kept below for reference.
    # username = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#fm-login-id')))
    # password = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#fm-login-password')))
    # submit = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.fm-btn > button')))
    #
    # username.clear()
    # user = input('输入会员名/邮箱/手机号:')
    # password.clear()
    # pwd = input('输入登录密码:')
    #
    # username.send_keys(user)
    # password.send_keys(pwd)
    # submit.click()

    try:
        qr_code = WebDriverWait(browser, 30, 5).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'i.iconfont.icon-qrcode')))
        qr_code.click()
        # above = browser.find_element_by_class_name('i.iconfont.icon-qrcode')
        # ActionChains(browser).click(above).perform()
        print('请用淘宝APP扫码登录')

        # The nickname element appearing is used as the "logged in" signal;
        # until() either returns the element or raises TimeoutException.
        WebDriverWait(browser, 60, 5).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.site-nav-login-info-nick')))
        url_index = WebDriverWait(browser, 60, 3).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.site-nav-menu-hd > a > span')))
        url_index.click()
    except TimeoutException:
        print('登录超时')
        browser.quit()
        return False
    return True


def main():
    """Log in, prompt for a keyword and a page count, then scrape each page.

    The browser is always closed when the crawl ends, including when an
    error (for example a non-integer page count) interrupts it.
    """
    if not login():
        # login() has already closed the browser; nothing left to do.
        return

    try:
        goods = input('输入您要搜索的商品:')
        page = int(input('输入您要爬取的总页数:'))
        url = 'https://s.taobao.com/search?q=' + quote(goods)
        browser.get(url=url)
        for i in range(1, page + 1):
            index_page(i)
    finally:
        browser.quit()


if __name__ == '__main__':
    main()
这篇关于使用 Selenium 爬取淘宝商品列表信息并存入 MongoDB 的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!
- 2024-12-20go-zero 框架的 RPC 服务 启动start和停止 底层是怎么实现的?-icode9专业技术文章分享
- 2024-12-19Go-Zero 框架的 RPC 服务启动和停止的基本机制和过程是怎么实现的?-icode9专业技术文章分享
- 2024-12-18怎么在golang中使用gRPC测试mock数据?-icode9专业技术文章分享
- 2024-12-15掌握PageRank算法核心!你离Google优化高手只差一步!
- 2024-12-15GORM 中的标签 gorm:"index"是什么?-icode9专业技术文章分享
- 2024-12-11怎么在 Go 语言中获取 Open vSwitch (OVS) 的桥接信息(Bridge)?-icode9专业技术文章分享
- 2024-12-11怎么用Go 语言的库来与 Open vSwitch 进行交互?-icode9专业技术文章分享
- 2024-12-11怎么在 go-zero 项目中发送阿里云短信?-icode9专业技术文章分享
- 2024-12-11怎么使用阿里云 Go SDK (alibaba-cloud-sdk-go) 发送短信?-icode9专业技术文章分享
- 2024-12-10搭建个人博客网站之一、使用hugo创建个人博客网站