Python + Requests + XPath (Parsing): Scraping Resume Images from a Site (Data Analysis, Part 3)
2021/5/5 20:26:00
This article walks through scraping resume images from a site with Python, Requests, and XPath parsing (Data Analysis, Part 3). It should be a useful reference for anyone tackling similar scraping problems; follow along with the examples below.
1. Environment Setup
pip install lxml
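To confirm the install worked, you can print the version constants that lxml exposes (a quick sanity check, nothing site-specific):

```python
# Quick check that lxml imports and reports its version
from lxml import etree

print(etree.LXML_VERSION)    # e.g. (4, 9, 3, 0)
print(etree.LIBXML_VERSION)  # version of the bundled libxml2
```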
2. How the Parsing Works
- Fetch the page data with a general-purpose crawler (Requests)
- Instantiate an etree object and load the page source into it
- Call the object's xpath method with XPath expressions to locate tags and extract the target data (see the sketch below)
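A minimal sketch of these three steps, using a made-up HTML string in place of a real response:

```python
from lxml import etree

# Step 1: in a real crawler this string would come from requests.get(...).text
page_text = '<html><body><ul><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul></body></html>'

# Step 2: load the page source into an etree object
tree = etree.HTML(page_text)

# Step 3: locate tags and extract data with XPath expressions
for li in tree.xpath('//ul/li'):
    print(li.xpath('./a/@href')[0], li.xpath('./a/text()')[0])
# /a First
# /b Second
```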
3. Hands-on Examples
- Project requirement: scrape new-home listing data from Fang.com (房天下)
```python
import requests
import os
import csv
from lxml import etree

if __name__ == '__main__':
    url = 'https://huizhou.newhouse.fang.com/house/s/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    if not os.path.exists('./fangtianxiaLibs'):
        os.makedirs('./fangtianxiaLibs')
    response = requests.get(url=url, headers=headers)
    # Set the response encoding manually
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@id="newhouse_loupai_list"]/ul/li')
    # Collect the scraped records in a list
    datas = []
    for li in li_list:
        try:
            # Some <li> elements are ads without a name block; skip those
            href_list = li.xpath('.//div[@class="nlcd_name"]/a/@href')
            if not href_list:
                continue
            detail_url = 'https:' + href_list[0]
            detail_text = requests.get(url=detail_url, headers=headers).text
            # Swap the URL suffix to reach the full detail page
            detail_url_new = detail_url.replace('.htm', '/housedetail.htm')
            detail_tree = etree.HTML(detail_text)
            # Parse the title and (average) price from the second-level page
            title = detail_tree.xpath('//div[@class="information"]//div[@class="tit"]/h1/strong/text()')[0]
            price = "".join(detail_tree.xpath(
                '//div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/h3/text()'
                ' | //div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/span/text()'
                ' | //div[@class="information_li mb5"]/div[@class="inf_left fl mr10"]/text()'
            )).strip('\n \t')
            # Request the full detail page and parse the surroundings sections
            detail_text_new = requests.get(url=detail_url_new, headers=headers).text
            tree_new = etree.HTML(detail_text_new)
            tree_list = tree_new.xpath('//div[@id="Configuration"]')
            for index in tree_list:
                zhoubian = "".join(index.xpath('./h3/text()')).strip('\n \t \r ')
                jiaotong = "".join(index.xpath(
                    './ul[@class="sheshi_zb"]/li/span/text()'
                    '|./ul[@class="sheshi_zb"]/li[@class="jiaotong_color"]/text()'
                )).strip('\n \t \r ')
                qita = "".join(index.xpath(
                    './ul[@class="sheshi_zb"]/li/span/text()'
                    '|./ul[@class="sheshi_zb"]/li/text()'
                )).strip('\n \t \r ')
                desc = zhoubian + ":" + jiaotong + ':' + qita + '\n'
                datas.append({'title': title, 'desc': desc, 'price': price})
        except Exception:
            # A listing whose markup does not match the expressions is skipped
            continue
    print(datas)
    # Write everything to one CSV; newline='' keeps the csv module
    # from inserting blank rows on Windows
    fileName = './fangtianxiaLibs/newhouse.csv'
    title_header = ['title', 'desc', 'price']
    with open(fileName, 'a', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, title_header)
        writer.writeheader()
        writer.writerows(datas)
```
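The `price` expression above relies on XPath's union operator `|`, which merges the results of several expressions and returns the matched nodes in document order. A self-contained sketch (the markup here is invented for illustration, not taken from the real site):

```python
from lxml import etree

html = etree.HTML('''
<div class="inf_left fl mr10">
  about <h3>12000</h3> <span>yuan/m2</span>
</div>
''')

# Union of three expressions: <h3> text, <span> text, and the div's own text nodes
parts = html.xpath('//div[@class="inf_left fl mr10"]/h3/text()'
                   ' | //div[@class="inf_left fl mr10"]/span/text()'
                   ' | //div[@class="inf_left fl mr10"]/text()')
print("".join(parts).split())  # ['about', '12000', 'yuan/m2'] -- document order
```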
- Project requirement: scrape image data from http://pic.netbian.com/4kmeinv/
```python
import requests
from lxml import etree

url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# Print the encoding the server declares (this site serves GBK, not UTF-8)
print(response.encoding)
page_text = response.text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="slist"]/ul/li')
for li in li_list:
    img_url = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_name = li.xpath('./a/img/@alt')[0]
    # Fix the mojibake: re-encode the mis-decoded text, then decode it as GBK
    img_name = img_name.encode('iso-8859-1').decode('gbk')
    print(img_url, img_name)
```
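The example above only prints the image URLs. To persist one, the raw bytes in `response.content` can be written in binary mode; a sketch along the same lines, where the `img_url` and `img_name` values are hypothetical stand-ins for what the loop produces:

```python
import os
import requests

# Hypothetical example values; in the loop above they come from the XPath results
img_url = 'http://pic.netbian.com/uploads/allimg/sample.jpg'
img_name = 'sample'

if not os.path.exists('./picLibs'):
    os.makedirs('./picLibs')

# .content holds the raw bytes of the response, so open the file in binary mode
img_data = requests.get(url=img_url, headers={'User-Agent': 'Mozilla/5.0'}).content
with open('./picLibs/' + img_name + '.jpg', 'wb') as fp:
    fp.write(img_data)
```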
- Project requirement: parse the names of all cities from https://www.aqistudy.cn/historydata/

```python
import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# Print the encoding the server declares
print(response.encoding)
page_text = response.text
tree = etree.HTML(page_text)
# //ul//li already matches every <li> under the lists (hot cities and full list alike)
li_list = tree.xpath('//div[@class="bottom"]/ul//li')
for li in li_list:
    city_name = li.xpath('./a/text()')[0]
    city_url = 'https://www.aqistudy.cn/historydata/' + li.xpath('./a/@href')[0]
    print(city_name, city_url)
```
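If you want the city list on disk rather than on stdout, the same csv module used in the Fang.com example works here too. A sketch under the same assumptions; the `cities.csv` filename is my own choice:

```python
import csv
import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {'User-Agent': 'Mozilla/5.0'}
tree = etree.HTML(requests.get(url=url, headers=headers).text)

# newline='' prevents blank rows on Windows when using the csv module
with open('cities.csv', 'w', encoding='utf-8', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(['city', 'url'])
    for a in tree.xpath('//div[@class="bottom"]/ul//li/a'):
        writer.writerow([a.xpath('./text()')[0],
                         'https://www.aqistudy.cn/historydata/' + a.xpath('./@href')[0]])
```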
- Project requirement: download the resume template images from https://sc.chinaz.com/
```python
import requests
from lxml import etree
import os

# Create the output folder
if not os.path.exists('./jianliLibs'):
    os.makedirs('./jianliLibs')

# Level 1: the portal page that links to the resume section
url = 'https://sc.chinaz.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
}
response_text = requests.get(url=url, headers=headers).text
# Parse the portal page once; page_index() reuses this tree
tree = etree.HTML(response_text)

def page_index(latest):
    # Walk listing pages 1..latest (page 1 has no index_N.html suffix)
    for index in range(1, latest + 1):
        if index == 1:
            muban_url = 'https://sc.chinaz.com' + tree.xpath('//div[@class="nav"]//li[@class="nos no3"]/a/@href')[3]
        else:
            muban_url = 'https://sc.chinaz.com' + tree.xpath('//div[@class="nav"]//li[@class="nos no3"]/a/@href')[3] + 'index_{}.html'.format(index)
        # Level 2: the template listing page
        response = requests.get(muban_url, headers=headers)
        # Set the response encoding manually
        response.encoding = 'utf-8'
        muban_text = response.text
        jianli_tree = etree.HTML(muban_text)
        # URLs of the individual resume pages
        jianli_url_list = jianli_tree.xpath('//div[@class="main_list jl_main"]//a/@href')
        for jianli_url in jianli_url_list:
            jianli_url = 'https:' + jianli_url
            # Level 3: the detail page of one resume
            jianli_detail = requests.get(jianli_url, headers=headers).text
            detail_tree = etree.HTML(jianli_detail)
            img_src_list = detail_tree.xpath('//div[@class="show_warp jl_warp clearfix"]//img/@src')
            for img_src in img_src_list:
                img_src = 'https:' + img_src
                img_src_content = requests.get(img_src, headers=headers).content
                # Name the file after the last URL segment so images do not overwrite each other
                imgName = img_src.split('/')[-1]
                imgPath = './jianliLibs/' + imgName
                # Persist the image bytes
                with open(imgPath, 'wb') as fp:
                    fp.write(img_src_content)
                print('Resume image ' + imgName + ' downloaded!')

if __name__ == '__main__':
    try:
        values = int(input('How many listing pages to crawl: '))
        page_index(values)
    except Exception as msg:
        print('Invalid input: {}'.format(msg))
```
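When paging through many templates like this, it is worth rate-limiting the requests and bounding how long each one may hang. A sketch of a small helper; the one-second delay and five-second timeout are arbitrary choices, and `fetch` is a hypothetical name:

```python
import time
import requests

def fetch(url, headers, delay=1.0, timeout=5):
    """GET a URL politely: sleep before the request and give up after `timeout` seconds."""
    time.sleep(delay)        # rate-limit to avoid hammering the site
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()  # surface HTTP errors instead of parsing error pages
    return resp

# Usage inside the loops above, e.g.:
# jianli_detail = fetch(jianli_url, headers).text
```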
That concludes this article on scraping resume images with Python, Requests, and XPath parsing (Data Analysis, Part 3). We hope the articles we recommend are helpful, and we hope you will keep supporting 为之网!