python简单爬虫

2021/5/19 1:27:19

本文主要介绍如何用 Python 编写一个简单的爬虫，对大家解决编程问题具有一定的参考价值，需要的程序员们随着小编一起来学习吧！

用 Python 编写简单爬虫，抓取某网站的简历模板

from lxml import etree
import requests


def main():
    """Fetch the resume listing page and pass the detail-page links on.

    Requests the listing page, prints the HTTP status code, reads the
    ``href`` of the ``<a>`` child of every ``<div>`` directly under
    ``div#container``, and calls ``main2`` with the collected results.

    Each collected entry is the (non-empty) list returned by the XPath
    query for one ``<div>``; ``main2`` uses the first element of each.
    """
    url = "https://sc.chinaz.com/jianli/free.html"
    header = {"User-Agent": "hahhaha"}
    # A timeout keeps the script from blocking forever on a stalled server.
    res = requests.get(url=url, headers=header, timeout=10)
    print(res.status_code)
    tree = etree.HTML(res.text)  # parse the page source for XPath queries
    # Several divs match, so this is a list of elements.
    div_list = tree.xpath('//div[@id="container"]/div')
    jl_list = []
    for div in div_list:
        href = div.xpath('./a/@href')  # list of hrefs under this div
        # Drop divs without a direct <a href>: an empty list here would
        # make the consumer fail when it indexes the first element.
        if href:
            jl_list.append(href)
    main2(jl_list)


def main2(jl_list):
    """Visit each detail page and collect its first download link.

    Args:
        jl_list: Iterable of href lists, one per listing entry. Only the
            first href of each list is used; empty lists are skipped.

    For every detail page, the first ``<li>`` inside
    ``div.clearfix.mt20.downlist`` is located and the ``href`` values of its
    ``<a>`` child are collected. The collected lists are passed to ``main3``.
    """
    header = {"User-Agent": "haha"}
    xz_list = []
    for xz in jl_list:
        if not xz:
            # Nothing to visit for this entry.
            continue
        link = xz[0]
        # Protocol-relative links ("//host/path") need a scheme; links that
        # already carry one must not get a second "http:" glued in front.
        url = "http:" + link if link.startswith("//") else link
        # A timeout keeps the script from blocking forever on a stalled server.
        res = requests.get(url=url, headers=header, timeout=10)
        tree = etree.HTML(res.text)
        li_nodes = tree.xpath('//div[@class="clearfix mt20 downlist"]//li')
        if not li_nodes:
            # Page has no download list; skip it instead of raising IndexError.
            continue
        href = li_nodes[0].xpath('./a/@href')  # download address(es)
        if href:
            xz_list.append(href)
    main3(xz_list)

def main3(xz_list):
    """Download each archive and save it as ``./jx模板/<n>.rar``.

    Args:
        xz_list: Iterable of href lists, one per archive. Only the first
            href of each list is downloaded; empty lists are skipped.

    Files are numbered from 0 in the order they are saved. The output
    directory is created on demand if it does not exist yet.
    """
    import os  # local import so this function does not depend on file-level imports

    out_dir = "./jx模板/"
    header = {"User-Agent": "haha"}
    t = 0  # running index used as the file name
    for rarxz in xz_list:
        if not rarxz:
            # No download link for this entry; nothing to fetch.
            continue
        url = rarxz[0]
        # A timeout keeps the script from blocking forever on a stalled server.
        res = requests.get(url=url, headers=header, timeout=30)
        # Created lazily so nothing is touched when there is nothing to save;
        # without it open() fails when the directory is missing.
        os.makedirs(out_dir, exist_ok=True)
        # Binary mode: the payload is an archive, not text. The with-block
        # closes the file, so no explicit close() is needed.
        with open(out_dir + str(t) + ".rar", "wb") as f:
            f.write(res.content)
        print("is ok ")
        t += 1


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


这篇关于python简单爬虫的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!


扫一扫关注最新编程教程