
Financial Data Analysis 3: Dangdang online shop product crawler, taking books as an example (requests & bs4)

Allen Ma · 2 minutes to read

Case (II): Crawler preview

Project 1: Dangdang online shop product crawler, taking books as an example

This case crawls the relevant page content using the bs4 library's find method.
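Before walking through the full script, here is a minimal sketch of how find locates a tag by class and how attributes and text are read from it. The HTML snippet is invented for illustration; it only mimics the shape of one Dangdang search-result item:

from bs4 import BeautifulSoup

# Hypothetical snippet shaped like one search-result <li>
html = '''
<li class="line1">
  <a class="pic" title="Python Crawling Basics" href="http://product.dangdang.com/123.html">
    <img src="http://img.dangdang.com/cover123.jpg">
  </a>
  <span class="search_now_price">&yen;39.50</span>
</li>
'''
soup = BeautifulSoup(html, 'html.parser')
li = soup.find('li', class_='line1')                     # first <li> whose class is "line1"
print(li.find('a', class_='pic').get('title'))           # attribute lookup -> Python Crawling Basics
print(li.find('span', class_='search_now_price').text)   # tag text -> ¥39.50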

# -*- coding: utf-8 -*-
import requests
import csv
from bs4 import BeautifulSoup as bs

# Fetch the HTML of one search-result page
def request_dandan(url):
    try:
        # Send a browser User-Agent so the request is not rejected
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
        r = requests.get(url, headers=headers)
        r.encoding = r.apparent_encoding  # Dangdang pages are not UTF-8; let requests detect the encoding
        if r.status_code == 200:
            return r.text
        return None
    except requests.RequestException:
        return None

# Write the header row to a fresh CSV file
def write_item_to_file():
    with open('dangdang.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Book title', 'Purchase link', 'Paperback price', 'Ebook price',
                         'Ebook link', 'Book details', 'Cover image URL', 'Review link',
                         'Author', 'Publication date', 'Publisher'])
    print('Column names successfully written to the CSV')

# Parse one page and append its rows to the CSV
def parse_dangdang_write(html):
    with open('dangdang.csv', 'a', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        soup = bs(html, 'html.parser')
        # Each of the (up to) 60 results on a page sits in an <li> with class "line1" ... "line60"
        class_tags = ['line' + str(x) for x in range(1, 61)]
        for class_tag in class_tags:
            li = soup.find('li', class_=class_tag)
            if li is None:  # page has fewer than 60 results
                continue
            book_name = li.find('a', class_='pic').get('title')  # book title
            paperbook_price = li.find('span', class_='search_now_price').text  # paperback price
            try:
                ebook_price = li.find('a', class_='search_e_price').find('i').text  # ebook price
                ebook_link = li.find('a', class_='search_e_price').get('href')  # ebook link
            except AttributeError:  # no ebook edition for this title
                ebook_price = ''
                ebook_link = ''
            detail = li.find('p', class_='detail').text  # book details
            book_purchase_link = li.find('a', class_='pic').get('href')  # purchase link for this book
            book_cover_link = li.find('a', class_='pic').find('img').get('src')  # cover image URL
            comment_link = li.find('a', class_='search_comment_num').get('href')  # review link
            author = li.find('p', class_='search_book_author').find('span').text  # author
            public_time = li.find('p', class_='search_book_author').find('span').next_sibling.text[2:]  # publication date
            public = li.find('p', class_='search_book_author').find('span').next_sibling.next_sibling.text[3:]  # publisher
            writer.writerow([book_name, book_purchase_link, paperbook_price, ebook_price,
                             ebook_link, detail, book_cover_link, comment_link,
                             author, public_time, public])

if __name__ == '__main__':
    write_item_to_file()
    for page in range(1, 10):  # crawl the first 9 result pages into the CSV file
        url = 'http://search.dangdang.com/?key=python%C5%C0%B3%E6&act=input&page_index=' + str(page)
        html = request_dandan(url)  # fetch the page HTML
        if html:  # skip pages that failed to download
            parse_dangdang_write(html)  # parse the page and append rows to the CSV
            print('Page {} data successfully written to the CSV'.format(page))
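The trickiest lines above are the ones that read the publication date and publisher via next_sibling. A small sketch shows why this works; the HTML structure here is inferred from the parsing code, not taken from Dangdang's actual markup, and the slice offsets [2:] and [3:] simply strip the leading separator characters:

from bs4 import BeautifulSoup

# Assumed shape of the author paragraph: three adjacent <span> siblings
html = ('<p class="search_book_author">'
        '<span><a>Allen Ma</a></span>'
        '<span> /2020-04-01</span>'
        '<span> / Some Press</span>'
        '</p>')
p = BeautifulSoup(html, 'html.parser').find('p', class_='search_book_author')
first = p.find('span')
print(first.text)                                # author -> Allen Ma
print(first.next_sibling.text[2:])               # drop " /"  -> 2020-04-01
print(first.next_sibling.next_sibling.text[3:])  # drop " / " -> Some Press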

Results of the run: screenshots of the generated dangdang.csv file (images omitted).
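To sanity-check the output without opening the file in Excel, the CSV can be read back with the standard library. This is just a usage sketch; the file name and column order match the script above:

import csv

with open('dangdang.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    header = next(reader)       # the column names written by write_item_to_file()
    for i, row in enumerate(reader):
        print(row[0], row[2])   # book title and paperback price
        if i >= 4:              # show only the first five books
            break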