标签:os.path 模拟 exist job tool process mat 融资 ESS
MARK:将信息写入文件解决乱码方法,开启进程池秒爬。
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
"""Scrape the Maoyan board pages and append every movie entry to result.txt.

Each board page is fetched over HTTP, parsed with a regular expression, and
every extracted movie is printed and written as one JSON object per line.
"""
import json
import re
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException

# Compiled once at import time instead of on every parse call. Raw strings
# keep escapes such as \d from being interpreted as string escapes.
MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)

# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 10


def get_one_page(url):
    """Return the body of *url* as text, or None on any failure.

    A non-200 status code and any ``requests`` exception (including a
    timeout) are both reported as None.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Yield one dict per movie found in *html*.

    Yields nothing when *html* is None or empty, so a failed download does
    not crash the caller.
    """
    if not html:
        return
    for item in MOVIE_PATTERN.findall(html):
        yield {
            '排行': item[0],
            '图片': item[1],
            '电影': item[2],
            # The slices drop a fixed-length leading label from the text
            # (presumably "主演:" and "上映时间:" -- verify against the page).
            '演员': item[3].strip()[3:],
            '上映信息': item[4].strip()[5:],
            # The score is split into integer and fraction parts in the markup.
            '评分': item[5] + item[6],
        }


def write_to_file(content):
    """Append *content* to result.txt as a single JSON line.

    ``encoding='utf-8'`` together with ``ensure_ascii=False`` keeps Chinese
    text readable in the output file instead of \\uXXXX escapes.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch the board page at *offset*, then print and store every entry."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Sequential alternative:
    #     for i in range(10):
    #         main(i * 10)
    # The process pool fetches the ten pages in parallel; the context manager
    # releases the worker processes once map() has returned.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
"""Download the .mp4 videos linked from the xiaohuar.com list pages."""
import os
import re

import requests

BASE_URL = 'http://www.xiaohuar.com'
DOWNLOAD_ROOT = "D://movie2//"
# Bytes written per chunk. iter_content() defaults to 1 byte per chunk,
# which makes downloading a video extremely slow.
CHUNK_SIZE = 64 * 1024

ITEM_LINK_PATTERN = re.compile('class="items".*?href="(.*?)"', re.S)
MEDIA_SRC_PATTERN = re.compile('id="media".*?src="(.*?)"', re.S)


def get_page(url):
    """Return the decoded body of *url*, or None if the request fails."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        # Let requests guess the charset from the content so the text is
        # decoded correctly even when the headers do not declare it.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        print("爬取失败")
        return None


def get_url(html):
    """Yield the absolute detail-page URLs found in a list page.

    Yields nothing when *html* is None or empty (failed download).
    """
    if not html:
        return
    for url in ITEM_LINK_PATTERN.findall(html):
        if not url.startswith('http'):
            url = BASE_URL + url
        yield url


def get_detail_url(detail_content):
    """Yield the .mp4 media URLs found in a detail page.

    Yields nothing when *detail_content* is None or empty.
    """
    if not detail_content:
        return
    for url in MEDIA_SRC_PATTERN.findall(detail_content):
        if url.endswith('.mp4'):
            yield url


def download(url):
    """Save the file at *url* into DOWNLOAD_ROOT unless it already exists.

    The body is streamed to a temporary ``.part`` file and renamed only after
    the download finished, so an interrupted transfer is not later mistaken
    for an existing complete file.
    """
    root = DOWNLOAD_ROOT
    path = root + url.split('/')[-1]
    try:
        os.makedirs(root, exist_ok=True)
        if os.path.exists(path):
            print("文件已存在")
            return
        partial_path = path + '.part'
        with requests.get(url, stream=True) as response:
            # Do not store an HTTP error page under a .mp4 name.
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)
        os.replace(partial_path, path)
        print("文件保存成功")
    except (requests.RequestException, OSError):
        print("下载失败")


def main(page_num):
    """Download every video reachable from list page number *page_num*."""
    url = 'http://www.xiaohuar.com/list-3-{0}.html'.format(page_num)
    html = get_page(url)
    for detail_page_url in get_url(html):
        detail_content = get_page(detail_page_url)
        for detail_url in get_detail_url(detail_content):
            download(detail_url)


if __name__ == '__main__':
    for num in range(30):
        main(num)
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
"""Log in to GitHub with a requests session and print the response status.

The account is read from the GITHUB_LOGIN and GITHUB_PASSWORD environment
variables so that credentials are not stored in the source file.
"""
import os

import requests
from pyquery import PyQuery

LOGIN_URL = 'https://github.com/login'
SESSION_URL = 'https://github.com/session'

# A session keeps the cookies set by the login page for the later POST.
session = requests.session()
response = session.get(LOGIN_URL)
text = PyQuery(response.text)

# The login form carries a hidden CSRF token that must be posted back.
authenticity_token = text(
    '#login > form > div:nth-child(1) > input[type="hidden"]:nth-child(2)'
).attr('value')
if not authenticity_token:
    raise RuntimeError('authenticity_token not found on the login page')

data = {
    'commit': 'Sign in',
    # NOTE(review): '?' looks like a mis-encoded character (the form field
    # probably held a check mark) -- confirm against the live login form.
    'utf8': '?',
    'authenticity_token': authenticity_token,
    'login': os.environ['GITHUB_LOGIN'],
    'password': os.environ['GITHUB_PASSWORD'],
}
response = session.post(SESSION_URL, data=data)
print(response.status_code)  # 200
配置文件config.py
|
1
2
3
4
5
6
7
|
# Settings for the Toutiao crawler (config.py).

# MongoDB connection target and the database / collection that receive results.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# First and last (inclusive) group numbers; the crawler multiplies each by 20
# to obtain the search offset of one index page.
GROUP_START = 1
GROUP_END = 20

# Search keyword sent to the index-page request.
KEYWORD = '街拍'
主爬虫文件
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
|
"""Crawl Toutiao search results, download gallery images, store metadata in MongoDB."""
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

from config import (GROUP_END, GROUP_START, KEYWORD, MONGO_DB, MONGO_TABLE,
                    MONGO_URL)

# connect=False defers the connection until first use, which is needed when
# the client object is created before the worker processes are forked.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

# Matches the JSON payload passed to JSON.parse("...") in the gallery page.
IMAGES_PATTERN = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)

# Seconds to wait for a response before giving up.
REQUEST_TIMEOUT = 10


def get_page_index(offset, keyword):
    """Fetch one search index page; return its text or None on failure."""
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    # urlencode turns the dict into a URL query string.
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None


def download_image(url):
    """Download the image at *url* and save it; failures are ignored."""
    print('Downloading', url)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None


def save_image(content):
    """Write *content* to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Naming the file after the MD5 digest of its bytes means the same image is
    never stored twice.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield the ``article_url`` of every item in an index-page JSON document.

    Yields nothing when *text* is None/empty or is not valid JSON.
    """
    if not text:
        return
    try:
        data = json.loads(text)
    except JSONDecodeError:
        return
    if data and 'data' in data.keys():
        # "data" may be present but null; treat that as an empty list.
        for item in data.get('data') or []:
            yield item.get('article_url')


def get_page_detail(url):
    """Fetch a detail page; return its text or None on failure."""
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_detail(html, url):
    """Extract title and image URLs from a gallery page and download the images.

    Returns a dict with ``title``, ``url`` and ``images``, or None when *html*
    is missing or the page carries no gallery data.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    result = IMAGES_PATTERN.search(html)
    if not result:
        return None
    # The embedded JSON is backslash-escaped; stripping the backslashes makes
    # it parseable.
    data = json.loads(result.group(1).replace('\\', ''))
    if not (data and 'sub_images' in data.keys()):
        return None
    sub_images = data.get('sub_images')
    images = [item.get('url') for item in sub_images]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def save_to_mongo(result):
    """Insert *result* into the configured collection; return True on success."""
    # insert_one replaces Collection.insert, which was removed in PyMongo 4.
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Process every article listed on the index page at *offset*."""
    text = get_page_index(offset, KEYWORD)
    for url in parse_page_index(text):
        if not url:
            # Some index items carry no article_url; nothing to fetch.
            continue
        html = get_page_detail(url)
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
import requests import re # 1、============================================认证流程 session = requests.session() # 第一步: # 请求的URL:https://passport.lagou.com/login/login.html, # 请求的方法GET, # 请求头只包含User-agent r1 = session.get(‘https://passport.lagou.com/login/login.html‘, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, }, ) X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = ‘(.*?)‘", r1.text, re.S)[0] X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = ‘(.*?)‘", r1.text, re.S)[0] # print(X_Anti_Forge_Code) # print(X_Anti_Forge_Token) # 第二步: # 1、请求的URL:https://passport.lagou.com/login/login.json, # 2、请求方法POST, # 3、请求头: # Referer:https://passport.lagou.com/login/login.html # User-Agent: # X-Anit-Forge-Code # X-Anit-Forge-Token # X-Requested-With # 4、请求体: # isValidate:true # username:1111111111 # password:70621c64832c4d4d66a47be6150b4a8e #代表明文密码alex3714 session.post(‘https://passport.lagou.com/login/login.json‘, headers={ ‘Referer‘: ‘https://passport.lagou.com/login/login.html‘, ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, ‘X-Anit-Forge-Code‘: X_Anti_Forge_Code, ‘X-Anit-Forge-Token‘: X_Anti_Forge_Token, ‘X-Requested-With‘: ‘XMLHttpRequest‘ }, data={ ‘isValidate‘: True, ‘username‘: ‘18611453110‘, ‘password‘: ‘70621c64832c4d4d66a47be6150b4a8e‘ } ) # 第三: # 1、请求的URL:https://passport.lagou.com/grantServiceTicket/grant.html, # 2、请求方法GET, # 3、请求头: # Referer:https://passport.lagou.com/login/login.html # User-Agent: session.get(‘https://passport.lagou.com/grantServiceTicket/grant.html‘, headers={ ‘Referer‘: ‘https://passport.lagou.com/login/login.html‘, ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, } ) # 验证 response = session.get(‘https://www.lagou.com/resume/myresume.html‘, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 
6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, } ) # print(‘18611453110‘ in response.text) # 2、============================================爬取职位信息 # 1、请求的url:https://www.lagou.com/jobs/positionAjax.json # 2、请求的方式:POST # 请求参数: # gj:3年及以下 # xl:不要求 # jd:不需要融资 # hy:移动互联网 # px:default # yx:15k-25k # city:全国 # 3、请求头: # User-Agent # Referer:https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD # X-Anit-Forge-Code:0 # X-Anit-Forge-Token:None # X-Requested-With:XMLHttpRequest # 4、请求体: # first:true # pn:1 # kd:python数据分析 from urllib.parse import urlencode params = {‘kw‘: ‘python数据分析‘} res = urlencode(params).split(‘=‘)[-1] url = ‘https://www.lagou.com/jobs/list_‘ + res # print(url) response = session.post(‘https://www.lagou.com/jobs/positionAjax.json‘, params={ # ‘gj‘: ‘3年及以下‘, # ‘xl‘: ‘不要求‘, # ‘jd‘: ‘不需要融资‘, # ‘hy‘: ‘移动互联网‘, ‘px‘: ‘default‘, ‘yx‘: ‘15k-25k‘, ‘city‘: ‘北京‘, ‘district‘: ‘海淀区‘, }, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, ‘Referer‘: url, }) # print(response.status_code) result = response.json()[‘content‘][‘positionResult‘][‘result‘] for comanpy_info in result: fullname = comanpy_info[‘companyFullName‘] emp_num = comanpy_info[‘companySize‘] salary = comanpy_info[‘salary‘] workyear = comanpy_info[‘workYear‘] positionName = comanpy_info[‘positionName‘] positionId = comanpy_info[‘positionId‘] detail_url = ‘https://www.lagou.com/jobs/%s.html‘ % (positionId) print(detail_url) print(fullname) print(emp_num) print(salary) print(workyear) print(positionName) print(positionId) print() # 3、============================================爬取职位信息 # 第一步:请求详情页: # 1、请求的detail_url:https://www.lagou.com/jobs/3984845.html # 2、请求的方式:GET # 3、请求头: # User-Agent r1 = session.get(detail_url, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, } ) 
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = ‘(.*?)‘", r1.text, re.S)[0] X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = ‘(.*?)‘", r1.text, re.S)[0] # 第二步:投递简历 # 1、请求的url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json # 2、请求的方式:POST # 3、请求头: # User-Agent # Referer:detail_url # X-Anit-Forge-Code:31832262 # X-Anit-Forge-Token:9ee8b4bc-7107-49a0-a205-cedd7e77c2d7 # X-Requested-With:XMLHttpRequest # 4、请求体: # ‘positionId‘:3984845 # ‘type‘:1 # ‘force‘:True session.post(‘https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json‘, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, ‘Referer‘: detail_url, ‘X-Anit-Forge-Code‘: X_Anti_Forge_Code, ‘X-Anit-Forge-Token‘: X_Anti_Forge_Token, ‘X-Requested-With‘: ‘XMLHttpRequest‘ }, data={ ‘positionId‘: positionId, ‘type‘: 1, ‘force‘: True } ) print(‘投递成功‘,detail_url) lagou
"""Log in to lagou.com, query job listings, and submit a resume to each result.

The account is read from the LAGOU_USERNAME and LAGOU_PASSWORD_MD5
environment variables so that credentials are not stored in the source file.
"""
import os
import re
from urllib.parse import quote_plus

import requests

USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36')
LOGIN_PAGE_URL = 'https://passport.lagou.com/login/login.html'


def _extract_forge_tokens(html):
    """Return ``(token, code)`` parsed from the inline script of a lagou page.

    Raises RuntimeError when either value is missing, instead of the bare
    IndexError that indexing an empty findall() result would produce.
    """
    tokens = re.findall("X_Anti_Forge_Token = '(.*?)'", html, re.S)
    codes = re.findall("X_Anti_Forge_Code = '(.*?)'", html, re.S)
    if not tokens or not codes:
        raise RuntimeError('X_Anti_Forge token/code not found in page')
    return tokens[0], codes[0]


# 1. ============================================ Authentication
session = requests.session()

# Step 1: GET the login page (only a User-Agent header is sent) and read the
# anti-forgery token and code embedded in it.
r1 = session.get(LOGIN_PAGE_URL, headers={'User-Agent': USER_AGENT})
X_Anti_Forge_Token, X_Anti_Forge_Code = _extract_forge_tokens(r1.text)

# Step 2: POST the credentials to login.json together with the anti-forgery
# headers. The header names are spelled "Anit" on purpose: that is the
# spelling used throughout the original script.
# NOTE(review): the original posted a 32-hex-digit password value, presumably
# a hash of the plain password -- confirm what the site expects.
session.post(
    'https://passport.lagou.com/login/login.json',
    headers={
        'Referer': LOGIN_PAGE_URL,
        'User-Agent': USER_AGENT,
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
    },
    data={
        'isValidate': True,
        'username': os.environ['LAGOU_USERNAME'],
        'password': os.environ['LAGOU_PASSWORD_MD5'],
    },
)

# Step 3: GET grant.html with the login page as Referer.
session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'Referer': LOGIN_PAGE_URL,
        'User-Agent': USER_AGENT,
    },
)

# Verification: request a page that requires a logged-in session. The
# response can be inspected for the account name to confirm the login.
response = session.get(
    'https://www.lagou.com/resume/myresume.html',
    headers={'User-Agent': USER_AGENT},
)

# 2. ============================================ Fetch job listings
# POST to positionAjax.json with the filters as query parameters and the
# keyword listing page as Referer.
# NOTE(review): the original notes list a request body (first, pn, kd) that
# the code never sends -- confirm whether it is required.
keyword = 'python数据分析'
# quote_plus yields the same encoding that urlencode() applied to the value.
url = 'https://www.lagou.com/jobs/list_' + quote_plus(keyword)

response = session.post(
    'https://www.lagou.com/jobs/positionAjax.json',
    params={
        # Optional filters, disabled in the original script:
        # 'gj': '3年及以下',
        # 'xl': '不要求',
        # 'jd': '不需要融资',
        # 'hy': '移动互联网',
        'px': 'default',
        'yx': '15k-25k',
        'city': '北京',
        'district': '海淀区',
    },
    headers={
        'User-Agent': USER_AGENT,
        'Referer': url,
    },
)

result = response.json()['content']['positionResult']['result']

# NOTE(review): the extracted source lost its indentation; the statements
# below are assumed to all belong to the loop body (a resume is delivered for
# every listed position) -- confirm against the original script.
for company_info in result:
    fullname = company_info['companyFullName']
    emp_num = company_info['companySize']
    salary = company_info['salary']
    workyear = company_info['workYear']
    positionName = company_info['positionName']
    positionId = company_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ======================================== Deliver the resume
    # Step 1: GET the job detail page to obtain a fresh token and code.
    r1 = session.get(detail_url, headers={'User-Agent': USER_AGENT})
    X_Anti_Forge_Token, X_Anti_Forge_Code = _extract_forge_tokens(r1.text)

    # Step 2: POST the delivery request with the detail page as Referer.
    session.post(
        'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
        headers={
            'User-Agent': USER_AGENT,
            'Referer': detail_url,
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
        },
        data={
            'positionId': positionId,
            'type': 1,
            'force': True,
        },
    )
    print('投递成功', detail_url)
import requests import re # 1、============================================认证流程 session = requests.session() # 第一步: # 请求的URL:https://passport.lagou.com/login/login.html, # 请求的方法GET, # 请求头只包含User-agent r1 = session.get(‘https://passport.lagou.com/login/login.html‘, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, }, ) X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = ‘(.*?)‘", r1.text, re.S)[0] X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = ‘(.*?)‘", r1.text, re.S)[0] # print(X_Anti_Forge_Code) # print(X_Anti_Forge_Token) # 第二步: # 1、请求的URL:https://passport.lagou.com/login/login.json, # 2、请求方法POST, # 3、请求头: # Referer:https://passport.lagou.com/login/login.html # User-Agent: # X-Anit-Forge-Code # X-Anit-Forge-Token # X-Requested-With # 4、请求体: # isValidate:true # username:1111111111 # password:70621c64832c4d4d66a47be6150b4a8e #代表明文密码alex3714 session.post(‘https://passport.lagou.com/login/login.json‘, headers={ ‘Referer‘: ‘https://passport.lagou.com/login/login.html‘, ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, ‘X-Anit-Forge-Code‘: X_Anti_Forge_Code, ‘X-Anit-Forge-Token‘: X_Anti_Forge_Token, ‘X-Requested-With‘: ‘XMLHttpRequest‘ }, data={ ‘isValidate‘: True, ‘username‘: ‘18611453110‘, ‘password‘: ‘70621c64832c4d4d66a47be6150b4a8e‘ } ) # 第三: # 1、请求的URL:https://passport.lagou.com/grantServiceTicket/grant.html, # 2、请求方法GET, # 3、请求头: # Referer:https://passport.lagou.com/login/login.html # User-Agent: session.get(‘https://passport.lagou.com/grantServiceTicket/grant.html‘, headers={ ‘Referer‘: ‘https://passport.lagou.com/login/login.html‘, ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, } ) # 验证 response = session.get(‘https://www.lagou.com/resume/myresume.html‘, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 
6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, } ) # print(‘18611453110‘ in response.text) # 2、============================================爬取职位信息 # 1、请求的url:https://www.lagou.com/jobs/positionAjax.json # 2、请求的方式:POST # 请求参数: # gj:3年及以下 # xl:不要求 # jd:不需要融资 # hy:移动互联网 # px:default # yx:15k-25k # city:全国 # 3、请求头: # User-Agent # Referer:https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD # X-Anit-Forge-Code:0 # X-Anit-Forge-Token:None # X-Requested-With:XMLHttpRequest # 4、请求体: # first:true # pn:1 # kd:python数据分析 from urllib.parse import urlencode params = {‘kw‘: ‘python数据分析‘} res = urlencode(params).split(‘=‘)[-1] url = ‘https://www.lagou.com/jobs/list_‘ + res # print(url) response = session.post(‘https://www.lagou.com/jobs/positionAjax.json‘, params={ # ‘gj‘: ‘3年及以下‘, # ‘xl‘: ‘不要求‘, # ‘jd‘: ‘不需要融资‘, # ‘hy‘: ‘移动互联网‘, ‘px‘: ‘default‘, ‘yx‘: ‘15k-25k‘, ‘city‘: ‘北京‘, ‘district‘: ‘海淀区‘, }, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, ‘Referer‘: url, }) # print(response.status_code) result = response.json()[‘content‘][‘positionResult‘][‘result‘] for comanpy_info in result: fullname = comanpy_info[‘companyFullName‘] emp_num = comanpy_info[‘companySize‘] salary = comanpy_info[‘salary‘] workyear = comanpy_info[‘workYear‘] positionName = comanpy_info[‘positionName‘] positionId = comanpy_info[‘positionId‘] detail_url = ‘https://www.lagou.com/jobs/%s.html‘ % (positionId) print(detail_url) print(fullname) print(emp_num) print(salary) print(workyear) print(positionName) print(positionId) print() # 3、============================================爬取职位信息 # 第一步:请求详情页: # 1、请求的detail_url:https://www.lagou.com/jobs/3984845.html # 2、请求的方式:GET # 3、请求头: # User-Agent r1 = session.get(detail_url, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, } ) 
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = ‘(.*?)‘", r1.text, re.S)[0] X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = ‘(.*?)‘", r1.text, re.S)[0] # 第二步:投递简历 # 1、请求的url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json # 2、请求的方式:POST # 3、请求头: # User-Agent # Referer:detail_url # X-Anit-Forge-Code:31832262 # X-Anit-Forge-Token:9ee8b4bc-7107-49a0-a205-cedd7e77c2d7 # X-Requested-With:XMLHttpRequest # 4、请求体: # ‘positionId‘:3984845 # ‘type‘:1 # ‘force‘:True session.post(‘https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json‘, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36‘, ‘Referer‘: detail_url, ‘X-Anit-Forge-Code‘: X_Anti_Forge_Code, ‘X-Anit-Forge-Token‘: X_Anti_Forge_Token, ‘X-Requested-With‘: ‘XMLHttpRequest‘ }, data={ ‘positionId‘: positionId, ‘type‘: 1, ‘force‘: True } ) print(‘投递成功‘,detail_url) lagou
标签:os.path 模拟 exist job tool process mat 融资 ESS
原文地址:https://www.cnblogs.com/yunlongaimeng/p/9802151.html