当当网

时间：2017-04-22 14:39:50 阅读：133 评论：0 收藏：0 [点我收藏+]

标签：headers exce 图书 window htm chrome count for beautiful

import requests

import os

import re

import urllib

import urllib.parse

from bs4 import BeautifulSoup

# Crawl book listings from dangdang.com.
#
# Flow: fetch the category index page -> for every book category ("kind")
# collect its sub-category links -> for every sub-category read the page
# count from the pager -> fetch each result page and print, for every book,
# its name, author, publisher, publication date and price.

count = 0  # number of sub-categories visited so far

all_url = 'http://category.dangdang.com/?ref=www-0-C'

headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
    # The standard HTTP header is spelled "Referer"; the previous key
    # "refer" is not a recognised header name.
    'Referer': 'http://category.dangdang.com/?ref=www-0-C',
}

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block the crawl forever.
request_timeout = 10

# Download and parse the category index page.
main_html = requests.get(all_url, headers=headers, timeout=request_timeout)
main_soup = BeautifulSoup(main_html.text, 'lxml')

# Every top-level book category lives in a "classify_kind" div inside the
# "classify_books" container.
classify_kind = main_soup.find("div", class_="classify_books", id="floor_1").find_all('div', class_='classify_kind')

for kind in classify_kind:  # each top-level category
    all_a = kind.find_all('a')  # links to the sub-categories of this category
    for a in all_a:  # each sub-category
        # Each category also contains a "更多" ("more") link that is not a
        # real sub-category; skip it.
        if a.get_text() == "更多":
            continue

        count += 1
        classify_kind_name = a.get_text()  # sub-category display name
        href = a.get('href')  # sub-category listing URL
        print(classify_kind_name, href)

        # Fetch the first listing page of the sub-category to read the pager.
        page_html = requests.get(href, headers=headers, timeout=request_timeout)
        try:
            # The second-to-last <span> of the pager holds the page count;
            # take the first run of digits in its text.
            max_span = re.findall(r"\d+", BeautifulSoup(page_html.text, 'lxml').find('ul', class_='paging').find_all('span')[-2].get_text())[0]
        except (AttributeError, IndexError):
            # No pager (or an unexpected pager layout): skip this
            # sub-category instead of aborting the whole crawl.
            print("no paging info, skipping", href)
            continue
        print(max_span)

        # The original author noted that using max_span + 1 as the bound
        # went past the real number of pages, so the bound is kept as-is.
        # NOTE(review): range(1, N) stops at page N - 1; confirm against the
        # live pager whether the last page is being skipped.
        for page in range(1, int(max_span)):  # each result page
            # Insert "pg<page>-" in front of "cp01" to address a specific page.
            page_url = href.replace("cp01", "pg" + str(page) + "-cp01")
            print(page_url)

            book_html = requests.get(page_url, headers=headers, timeout=request_timeout)
            book_soup = BeautifulSoup(book_html.text, 'lxml')
            shoplist = book_soup.find('div', class_='con shoplist')
            if shoplist is None:
                # Page without a book list; nothing to extract here.
                print("no book list, skipping", page_url)
                continue
            book_url = shoplist.find_all('li')  # one <li> per book

            for li in book_url:  # each book
                # Some entries lack one of the fields below: a missing tag
                # yields None (AttributeError on access) and a non-matching
                # regex yields an empty list (IndexError). Skip such
                # entries rather than stopping the crawl.
                try:
                    bookname = li.find("p", class_="name").string
                    author = li.find("p", class_="author").a.string
                    publisher = li.find("p", class_="publishing").a.string
                    # The date sits between "/ " and a trailing space in the
                    # tag's markup; strip the spaces and the slash.
                    publishe_time = re.findall(r"/ .+ ", str(li.find("p", class_="publishing_time")))[0].replace(" ", "").replace("/", "")
                    price = li.find("p", class_="price").span.string
                except (AttributeError, IndexError):
                    continue
                print(bookname, author, publisher, publishe_time, price)

当当网

标签：headers exce 图书 window htm chrome count for beautiful

原文地址：http://www.cnblogs.com/Mdudu/p/6747560.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行