def next_url(page):
    """Return the href suffix of the next-page link, or None on the last page."""
    # Douban labels its next-page link "后页>" ("next page")
    a = page.find('a', text=re.compile("^后页"))
    if a:
        return a.attrs['href']
    else:
        return None
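For context, a minimal sketch of how the returned suffix might be used; base_url is an assumption, not from the original, and on Douban the href looks like ?start=25&filter=:

# Hypothetical usage (base_url is assumed, not from the original)
base_url = 'https://movie.douban.com/top250'
suffix = next_url(get_soup(base_url))
if suffix:
    next_page = base_url + suffix  # e.g. .../top250?start=25&filter=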
Saving the data to Excel
import xlwt

def xls_save(workbook, data, count):
    """Save the data to Excel.

    Note: despite its name, `workbook` must be a worksheet created with
    xlwt.Workbook().add_sheet() -- only worksheets have a write() method.
    """
    for d in data:  # one movie per row
        for i in range(len(d)):
            workbook.write(count, i, d[i])
        count = count + 1  # advance to the next row
    return workbook, count
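A hedged usage sketch under that assumption; the sheet and file names below are placeholders, not from the original:

book = xlwt.Workbook(encoding='utf-8')
sheet = book.add_sheet('top250')       # placeholder sheet name
data = get_data(page)                  # page as returned by get_soup()
sheet, row = xls_save(sheet, data, 0)
book.save('top250.xls')                # placeholder file name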
The complete scraper, assembled from the pieces above:

from urllib import request
from chardet import detect
from bs4 import BeautifulSoup
import re
import xlwt
import os
def get_soup(page_url):
    """Fetch the page source and return it as a parsed BeautifulSoup object."""
    with request.urlopen(page_url) as fp:
        byt = fp.read()
        det = detect(byt)  # sniff the encoding with chardet
        return BeautifulSoup(byt.decode(det['encoding']), 'lxml')
def get_data(page):
    """
    Extract the data.
    select() takes CSS selectors to locate tags; find() gives access to
    a tag's attrs values.
    """
    data = []
    ol = page.find('ol', attrs={'class': 'grid_view'})
    for li in ol.select('li'):  # one <li> per movie
        tmp = []
        titles = []  # a movie can carry several titles
        img_url = li.find('img').attrs['src'].strip()
        tmp.append(img_url)
        # re.compile('') matches any value, i.e. every span with a class
        for span in li.find_all('span', attrs={"class": re.compile('')}):
            if span.attrs['class'][0] == 'title':
                titles.append(span.string.strip())
            if span.attrs['class'][0] == 'rating_num':
                tmp.append(span.string.strip())
            if span.attrs['class'][0] == 'inq':
                tmp.append(span.string.strip())
        # join the titles into one string: xlwt's write() rejects lists
        tmp.insert(0, ' / '.join(titles))
        data.append(tmp)
    return data
def next_url(page):
    """Return the href suffix of the next-page link, or None on the last page."""
    # Douban labels its next-page link "后页>" ("next page")
    a = page.find('a', text=re.compile("^后页"))
    if a:
        return a.attrs['href']
    else:
        return None
def xls_save(workbook, data, count):
    """Save the data to Excel (`workbook` is a worksheet; see the note above)."""
    for d in data:
        for i in range(len(d)):
            workbook.write(count, i, d[i])
        count = count + 1
    return workbook, count
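The listing defines the helpers but no entry point. A minimal driver sketch showing how they could be wired together, assuming the Douban Top 250 URL and placeholder output names (none of these are from the original):

def main():
    base_url = 'https://movie.douban.com/top250'  # assumed entry URL
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('top250')              # placeholder sheet name
    count = 0
    suffix = ''
    while suffix is not None:
        page = get_soup(base_url + suffix)
        sheet, count = xls_save(sheet, get_data(page), count)
        suffix = next_url(page)                   # None on the last page
    book.save('top250.xls')                       # placeholder file name

if __name__ == '__main__':
    main()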