Published 2020-10-11 · Source: www.pcxitongcheng.com · Author: 电脑系统城
Approach:
1. Convert the list of target cities into their corresponding codes via the city API.
2. Iterate over cities and job queries to build a listing URL for each combination.
3. Fetch the listing page at that URL and iterate over the jobs it contains.
4. For each job, follow its job_link to the detail page and store the extracted fields as a dict data, appended to the list datas.
5. If the listing has a next page, repeat steps 3 and 4, passing datas down through each call (see the sketch after this list).
6. When one city/query URL is fully crawled, append its datas to datas_list, then repeat steps 3 to 5 for the next combination.
7. Finally, iterate over datas_list and write every record into an Excel file.
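Steps 3 to 6 reduce to one recursive pattern: scrape a page, accumulate into datas, and recurse while a live "next" link exists. A minimal sketch of just that skeleton, with the per-job parsing collapsed to raw text (the real script builds a full dict per job; the `.job-primary` and `next` selectors are the ones the script itself uses):

```python
import time
import requests
from bs4 import BeautifulSoup

def crawl_pages(url, datas):
    """Steps 3-6 in miniature: parse one listing page, then recurse into the next."""
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for job in soup.select(".job-primary"):                    # step 3: each job on this page
        datas.append({"text": job.get_text(" ", strip=True)})  # step 4, collapsed to one field
    next_link = soup.find("a", attrs={"class": "next"})
    if next_link is None or next_link.get("href") == "javascript:;":
        return datas                                           # step 5: last page, stop
    time.sleep(5)                                              # throttle between requests
    return crawl_pages("https://www.zhipin.com" + next_link.get("href"), datas)
```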
Key techniques (each illustrated with a short sketch after this list):
1. Getting a response body as JSON, then parsing it and pulling out values
2. Using soup's select(), find_all(), and find() methods
3. Using Exception for control flow
4. Creating and editing an Excel file with xlwt
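For point 1, requests decodes a JSON body directly via `.json()`, after which everything is plain dicts and lists. A minimal sketch against the same city endpoint the script queries, walking the `zpData` → `cityList` → `subLevelModelList` keys it uses:

```python
import requests

resp = requests.get("https://www.zhipin.com/wapi/zpCommon/data/city.json")
payload = resp.json()  # parse the JSON response into dicts and lists
for province in payload["zpData"]["cityList"]:
    for city in province["subLevelModelList"]:
        if city["name"] == "杭州":
            print(city["code"])  # the code that goes into the listing URL
```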
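For point 2, the three lookup methods differ mainly in what they return: `select()` takes a CSS selector and returns a list, `find_all()` returns a list of tags matching a name/attribute filter, and `find()` returns only the first match (or None). A self-contained sketch on toy markup:

```python
from bs4 import BeautifulSoup

html = ('<div class="job-primary">'
        '<a href="/job/1" data-jobid="1">dev</a>'
        '<a href="/job/2" data-jobid="2">qa</a>'
        '</div>')
soup = BeautifulSoup(html, 'html.parser')

print(soup.select(".job-primary"))                 # CSS selector -> list of matching tags
print(soup.find_all("a"))                          # tag filter -> list of both <a> tags
first = soup.find("a")                             # first match only (or None)
print(first.get("href"), first.get("data-jobid"))  # /job/1 1
```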
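For point 3, the script uses an exception as a pagination sentinel: when the "next" link is the dead `javascript:;`, it raises a bare Exception so that the `except` branch ends the crawl and the `else` branch continues it. The control-flow shape, reduced to a sketch (the `next_page_or_done` helper is illustrative, not part of the script):

```python
def next_page_or_done(next_url):
    try:
        if next_url == "javascript:;":  # dead link on the last page
            raise Exception("last page")
    except Exception as e:
        print("last page reached; %s" % e)
        return None                     # caller stops and returns its datas
    else:
        return "https://www.zhipin.com" + next_url  # caller keeps crawling

print(next_page_or_done("javascript:;"))         # None
print(next_page_or_done("/c101210100/?page=2"))  # full URL of the next page
```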
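For point 4, xlwt needs only three calls: `Workbook()` plus `add_sheet()` to create, `write(row, col, value)` to fill cells, and `save()` to emit a legacy .xls file. A minimal sketch (the sheet name and headers here are placeholders):

```python
import xlwt

book = xlwt.Workbook(encoding='utf-8')  # new workbook
sheet = book.add_sheet("demo")          # one worksheet
sheet.write(0, 0, "job_id")             # header row: write(row, col, value)
sheet.write(0, 1, "job_name")
sheet.write(1, 0, "12345")              # first data row
sheet.write(1, 1, "软件测试")
book.save("demo.xls")                   # xlwt writes the legacy .xls format
```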
```python
import requests, time, xlwt
from bs4 import BeautifulSoup


class MyJob():
    def __init__(self, mycity, myquery):
        self.city = mycity
        self.query = myquery
        self.list_url = "https://www.zhipin.com/job_detail/?query=%s&city=%s&industry=&position=" % (self.query, self.city)
        self.datas = []
        self.header = {
            'authority': 'www.zhipin.com',
            'method': 'GET',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'cookie': 'lastCity=101210100;uab_collina=154408714637849548916323;toUrl=/;c=1558272251;g=-;l=l=%2Fwww.zhipin.com%2Fuser%2Flogin.html&r=; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1555852331,1556985726,1558169427,1558272251; __a=40505844.1544087205.1558169426.1558272251.41.14.4.31; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1558272385',
            'referer': 'https://www.zhipin.com/?ka=header-logo',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }

    # Convert city names into the site's numeric city codes
    def get_city(self, city_list):
        city_url = "https://www.zhipin.com/wapi/zpCommon/data/city.json"
        # fetch the city tree and walk province -> city
        payload = requests.get(city_url).json()
        zpData = payload["zpData"]["cityList"]
        codes = []
        for city in city_list:
            for data_sf in zpData:                            # province level
                for data_dq in data_sf["subLevelModelList"]:  # city level
                    if city == data_dq["name"]:
                        codes.append(data_dq["code"])
        return codes

    # Collect every listing page for one city/query URL
    def get_job_list(self, url, datas):
        print(url)
        html = requests.get(url, headers=self.header).text
        soup = BeautifulSoup(html, 'html.parser')
        jobs = soup.select(".job-primary")
        for job in jobs:
            data = {}
            # job id
            data["job_id"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("a").get("data-jobid")
            # job link
            data["job_link"] = "https://www.zhipin.com" + job.find_all("div", attrs={"class": "info-primary"})[0].find("a").get("href")
            # job title
            data["job_name"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("div", attrs={"class": "job-title"}).get_text()
            # salary
            data["job_red"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("span", attrs={"class": "red"}).get_text()
            # address / years of experience / education
            data["job_address"] = job.find_all("div", attrs={"class": "info-primary"})[0].find("p").get_text().split(" ")
            # company link
            data["job_company_link"] = job.find_all("div", attrs={"class": "info-company"})[0].find("a").get("href")
            # company info
            data["job_company"] = job.find_all("div", attrs={"class": "info-company"})[0].find("p").get_text().split(" ")
            # boss avatar link
            data["job_publis_link"] = job.find_all("div", attrs={"class": "info-publis"})[0].find("img").get("src")
            # boss info
            data["job_publis"] = job.find_all("div", attrs={"class": "info-publis"})[0].find("h3").get_text().split(" ")
            time.sleep(5)
            self.get_job_detail(data)  # fill in the detail-page fields
            print(data)
            datas.append(data)  # append each job until the current page is done
        try:
            next_url = soup.find("div", attrs={"class": "page"}).find("a", attrs={"class": "next"}).get("href")
            # if next_url[-1] == "3":  # uncomment to stop after page 2 while debugging
            if next_url in "javascript:;":  # dead link on the last page: raise to stop
                raise Exception()
        except Exception as e:
            print("last page reached; %s" % e)
            return datas  # all pages collected
        else:
            time.sleep(5)
            next_url = "https://www.zhipin.com" + next_url
            self.get_job_list(next_url, datas)
            return datas  # all pages collected

    # Scrape the detail page and merge its fields into data
    def get_job_detail(self, data):
        print(data["job_link"])
        html = requests.get(data["job_link"], headers=self.header).text
        soup = BeautifulSoup(html, 'html.parser')
        # hiring company
        data["detail_content_name"] = soup.find_all("div", attrs={"class": "detail-content"})[0].find("div", attrs={"class": "name"}).get_text()
        # benefits
        data["detail_primary_tags"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("div", attrs={"class": "job-tags"}).get_text().strip()
        # job title
        data["detail_primary_name"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("h1").get_text()
        # recruiting status
        data["detail_primary_status"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("div", attrs={"class": "job-status"}).get_text()
        # salary
        data["detail_primary_salary"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("span", attrs={"class": "salary"}).get_text()
        # address / years of experience / education
        data["detail_primary_address"] = soup.find_all("div", attrs={"class": "info-primary"})[0].find("p").get_text()
        # work address
        data["detail_content_address"] = soup.find_all("div", attrs={"class": "detail-content"})[0].find("div", attrs={"class": "location-address"}).get_text()
        # job description
        data["detail_content_text"] = soup.find_all("div", attrs={"class": "detail-content"})[0].find("div", attrs={"class": "text"}).get_text().strip().replace(";", "\n")
        # boss name
        data["detail_op_name"] = soup.find_all("div", attrs={"class": "detail-op"})[1].find("h2", attrs={"class": "name"}).get_text()
        # boss job title
        data["detail_op_job"] = soup.find_all("div", attrs={"class": "detail-op"})[1].find("p", attrs={"class": "gray"}).get_text().split("·")[0]
        # boss status
        data["detail_op_status"] = soup.find_all("div", attrs={"class": "detail-op"})[1].find("p", attrs={"class": "gray"}).get_text().split("·")[1]

    # Write the collected data into an Excel file
    def setExcel(self, datas_list):
        book = xlwt.Workbook(encoding='utf-8')
        table = book.add_sheet("boss软件测试")
        table.write(0, 0, "编号")
        table.write(0, 1, "招聘链接")
        table.write(0, 2, "招聘岗位")
        table.write(0, 3, "薪资")
        table.write(0, 4, "地址")
        table.write(0, 5, "企业链接")
        table.write(0, 6, "企业信息")
        table.write(0, 7, "boss链接")
        table.write(0, 8, "boss信息")
        table.write(0, 9, "detail详情")
        i = 1
        for data in datas_list:
            table.write(i, 0, data["job_id"])
            table.write(i, 1, data["job_link"])
            table.write(i, 2, data["job_name"])
            table.write(i, 3, data["job_red"])
            # these fields were split() into lists; join them back, since xlwt cannot write a list
            table.write(i, 4, " ".join(data["job_address"]))
            table.write(i, 5, data["job_company_link"])
            table.write(i, 6, " ".join(data["job_company"]))
            table.write(i, 7, data["job_publis_link"])
            table.write(i, 8, " ".join(data["job_publis"]))
            # detail-page fields go into columns 10-20, after the "detail详情" header
            table.write(i, 10, data["detail_content_name"])
            table.write(i, 11, data["detail_primary_name"])
            table.write(i, 12, data["detail_primary_status"])
            table.write(i, 13, data["detail_primary_salary"])
            table.write(i, 14, data["detail_primary_address"])
            table.write(i, 15, data["detail_content_text"])
            table.write(i, 16, data["detail_op_name"])
            table.write(i, 17, data["detail_op_job"])
            table.write(i, 18, data["detail_op_status"])
            table.write(i, 19, data["detail_primary_tags"])
            table.write(i, 20, data["detail_content_address"])
            i += 1
        book.save(r'C:\%s_boss软件测试.xls' % time.strftime('%Y%m%d%H%M%S'))
        print("Excel saved")


if __name__ == '__main__':
    city_list = MyJob("", "").get_city(["杭州"])
    query_list = ["软件测试", "测试工程师"]
    datas_list = []
    for city in city_list:
        for query in query_list:
            myjob = MyJob(city, query)
            datas = myjob.get_job_list(myjob.list_url, myjob.datas)
            datas_list.extend(datas)
    myjob.setExcel(datas_list)
```
That wraps up this walkthrough of using Python and bs4 to scrape Boss直聘's static pages.