Scraping Job-Listing Sites with Python

2021/12/23 20:12:11

This article walks through scraping job-listing information from a recruitment site with Python. It should be a useful reference for anyone facing a similar task, so let's dive in!

1. What is a web crawler?

  • In one sentence: a crawler stands in for a human and drives a browser-like client through web pages. Here, that means programmatically opening pages, extracting the data we need, and storing it in a database.
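At its smallest, that is just "send an HTTP request, read the response". A minimal warm-up sketch using only the standard library (the URL and User-Agent string here are placeholders; the full script below does the same thing with more realistic headers):

import urllib.request

# pretend to be a browser so the server is less likely to reject the request
req = urllib.request.Request(
    "https://example.com/",                 # placeholder URL
    headers={"User-Agent": "Mozilla/5.0"},
)
with urllib.request.urlopen(req) as response:
    html = response.read().decode("utf-8")  # decode with the page's charset
print(html[:200])                           # peek at the beginning of the page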

2. The scraping code

The code is as follows (working example):

# -*- coding: utf-8 -*-
# @Time    : 2021/12/2 22:53
# @Author  : 19310220204
# @File    : spider test.py
# @Software: PyCharm

from bs4 import BeautifulSoup        # HTML parsing, data extraction
import re                            # regular expressions for text matching
import urllib.request, urllib.error  # build requests, fetch page content
import sqlite3                       # SQLite database operations
import os                            # check whether the database file already exists

def main():
    print("Starting crawl...")
    baseurl = "https://search.51job.com/list/150300,000000,0000,00,9,99,+,2,"
    datalist = getData(baseurl)
    db_savepath = "51job.db"
    saveData_db(datalist, db_savepath)
    print("Crawl finished!")

# --*-- matching rules --*--
# one capture per job posting (the listing data sits in a JSON blob inside a <script> tag)
findjob = re.compile(r'"is_special_job":"",(.*?),"adid":""')
# job detail link
findjob_L = re.compile(r'"job_href":"(.*?)",')
# job title
findjob_n = re.compile(r'"job_name":"(.*?)",')
# company detail link
findcompany_L = re.compile(r'"company_href":"(.*?)",')
# company name
findcompany_n = re.compile(r'"company_name":"(.*?)",')
# offered salary
findsalary = re.compile(r'"providesalary_text":"(.*?)",')
# work location
findarea = re.compile(r'"workarea_text":"(.*?)",')
# company type
findcompany_t = re.compile(r'"companytype_text":"(.*?)",')
# job benefits
findjob_w = re.compile(r'"jobwelf":"(.*?)",')
# company size
findcompany_s = re.compile(r'"companysize_text":"(.*?)",')
# company industry
findcompany_i = re.compile(r'"companyind_text":"(.*?)"')
# other attributes (experience, education, etc.)
findjob_a = re.compile(r'"attribute_text":(.*?)]')
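# For orientation: each record captured by findjob is a fragment of the page's
# embedded JSON, roughly shaped like this (illustrative only -- the key names
# come from the patterns above, the values are invented):
#   "is_special_job":"","job_href":"https:\/\/jobs.51job.com\/...","job_name":"...",
#   ...,"companyind_text":"...","attribute_text":["...","..."],...,"adid":""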

# crawl the listing pages
def getData(baseurl):
    datalist = []
    count = 0
    # (pattern, strip_backslashes) for every field, in table-column order
    fields = [
        (findjob_L, True), (findjob_n, True),
        (findcompany_L, True), (findcompany_n, True),
        (findsalary, True), (findarea, True),
        (findcompany_t, False), (findjob_w, False),
        (findcompany_s, False), (findcompany_i, True),
    ]
    for i in range(1, 165):              # result pages are numbered 1.html, 2.html, ...
        url = baseurl + str(i) + ".html"
        html = askURL(url)               # fetch the page source
        print("Fetching page %d..." % i)
        # parse out the embedded job data
        soup = BeautifulSoup(html, "html.parser")
        ps = soup.find_all("script", type="text/javascript")
        ps_l = str(ps[2])                # the third <script> block holds the JSON blob
        for j_list in re.findall(findjob, ps_l):
            data = []
            for pattern, strip in fields:
                value = re.findall(pattern, j_list)[0]
                if strip:
                    value = value.replace("\\", "")  # drop JSON escape backslashes
                data.append(value)
            # findjob_a consumes the closing "]", so strip the quotes and add it back
            attribute_text = re.findall(findjob_a, j_list)[0]
            data.append(attribute_text.replace('"', "") + "]")

            datalist.append(data)
            count += 1
        if count >= 1000:                # stop once ~1000 postings are collected
            break
    print("Fetched %d postings in total." % count)
    return datalist

# fetch the page content at the given URL
def askURL(url):
    # request headers: masquerade as a browser so the server serves the normal page
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36 Edg/96.0.1054.53",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        #"Cookie": '''guid=129f518e4b1d964be3ec59fd44319ee3; _ujz=MTg0NDQxOTY5MA%3D%3D; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; 51job=cenglish%3D0%26%7C%26; search=jobarea%7E%60150300%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60150300%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CE%DF%BA%FE%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60150300%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21'''
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('gbk')  # 51job pages are GBK-encoded
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# save the data
def init_db(dbpath):
    if os.path.exists(dbpath):
        print("Database file already exists!")
    else:
        sql = '''
        create table if not exists job(
        id integer primary key autoincrement,
        job_href text,
        job_name text,
        company_href text,
        company_name text,
        providesalary text,
        workarea text,
        companytype text,
        jobwelf text,
        companysize_text text,
        companyind_text text,
        attribute_text text
        )
        '''
        conn = sqlite3.connect(dbpath)
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
        conn.close()

def saveData_db(datalist, savepath):
    init_db(savepath)
    conn = sqlite3.connect(savepath)
    cur = conn.cursor()
    sql = '''
            insert into job(
            job_href, job_name, company_href, company_name, providesalary, workarea,
            companytype, jobwelf, companysize_text, companyind_text, attribute_text)
            values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'''
    for data in datalist:
        # parameterized insert: quoting values by hand breaks as soon as
        # a field itself contains a quote character
        cur.execute(sql, data)
    conn.commit()
    cur.close()
    conn.close()
    print("Scraped data saved to the database file! |", savepath)

if __name__ == '__main__':
    main()
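To sanity-check the result, you can query the SQLite file directly. A minimal sketch (the column names match the job table created by init_db above):

import sqlite3

conn = sqlite3.connect("51job.db")
cur = conn.cursor()
# print the ten most recently inserted postings: title, company, salary
for row in cur.execute(
        "select job_name, company_name, providesalary "
        "from job order by id desc limit 10"):
    print(row)
conn.close()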



That wraps up this article on scraping job-listing information with Python. We hope it has been helpful, and we appreciate your continued support of 为之网!

