Crawling all the books on the Daomu Biji (盗墓笔记) website with urllib & BeautifulSoup

python学习网 2018-06-28 20:59:02
'''
download_html: accepts a url, returns the html and a BeautifulSoup instance
spider: accepts html/soup, returns urls and data
process_data: processes strings and saves the data
controller: controls and calls the other functions
'''
# coding=utf-8

__author__ = 'Leslie'

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re, collections, os


# Accept a url, return the html source and a BeautifulSoup instance
def download_html(url):
    html = urlopen(url).read().decode('utf-8')  # fetch the page data
    soup = BeautifulSoup(html, 'lxml')          # instantiate BeautifulSoup
    return html, soup
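
A quick sanity check of download_html might look like the following; this is only a sketch (it assumes the index page http://www.seputu.com/ is reachable and that the lxml parser is installed):

# Sketch only: fetch the index page and confirm the parser works
html, soup = download_html(r'http://www.seputu.com/')
print(len(html))          # size of the raw page source
print(soup.title.string)  # the page <title> text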


# spider: accepts html/soup, returns the url queue and title queue, or the chapter text
def spider(html=False, soup=False):
    # Crawl the urls and titles on the index page
    if not html and soup:
        queue_url = collections.deque()    # queue of urls
        queue_title = collections.deque()  # queue of titles

        # Locate the elements and extract the href and title attributes of the <a> tags
        for item in soup.find_all("div", {"class": "box"}):
            for Alabel in item.find_all("a"):
                queue_url.append(Alabel["href"])

                # Strip the redundant characters from the title string
                Str1 = Alabel["title"]
                Str2 = '_盗墓笔记9在线阅读_盗墓笔记全集'
                if Str2 in Str1:
                    Str1 = Str1.replace(Str2, '')

                index = Str1.index(']')
                Str1 = Str1[index + 1:].strip()
                queue_title.append(Str1)

        return queue_url, queue_title

    # Crawl the chapter text
    if html and soup:
        all_p_label = soup.find("div", class_="content-body").find_all("p")
        return all_p_label
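
To make the two calling modes of spider concrete, here is a rough usage sketch (again assuming the site is reachable): passing only soup returns the url/title queues from the index page, while passing both html and soup returns the <p> tags of a chapter page.

# Sketch only: index mode vs. chapter mode
html, soup = download_html(r'http://www.seputu.com/')
queue_url, queue_title = spider(soup=soup)            # index mode: two deques
chapter_html, chapter_soup = download_html(queue_url[0])
p_tags = spider(chapter_html, chapter_soup)           # chapter mode: list of <p> tags
print(queue_title[0], len(p_tags))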

# Process the strings and save the data
def process_data(Data, title):
    # Remove characters that are not allowed in file names: [\/?:*<>"|]
    while '\\' in title:
        index = title.index('\\')
        title = title[:index] + title[index + 1:]

    matchList = re.findall('[/?:*<>"|]*', title)
    matchStr = ''.join(matchList)  # e.g. '?><'

    title = list(title)
    for j in matchStr:
        title.remove(j)
    title = ''.join(title)

    # Absolute path of the file to save
    abspath = os.path.join(os.path.abspath(r'.\daomubiji1'), title)

    # Strip redundant strings in the text such as: www.seputu.com
    CMP = re.compile(r"(http://)?([a-zA-Z]+\.)+com")  # compiled regular expression object

    for i in Data:
        each_string = str(i.string).replace(" ", "").strip()

        if each_string != "None":
            Match = re.search(CMP, each_string)  # match against the string

            # Append the text to the txt file
            with open(abspath, 'a', encoding='utf-8') as fp:
                if Match is not None:
                    Newstring = each_string[:Match.span()[0]]
                    fp.write(Newstring + '\n')
                else:
                    fp.write(each_string + '\n')
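
The character-by-character cleanup of title above works, but the same cleanup can be done in one step with re.sub. This clean_title helper is only an illustration, not part of the original script:

import re

def clean_title(title):
    # Hypothetical helper: drop every character Windows forbids in file names
    return re.sub(r'[\\/?:*<>"|]', '', title)

print(clean_title('盗墓笔记 第一章?'))  # -> '盗墓笔记 第一章'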

# Control and dispatch
def controller(url):
    # Get the queue of urls to crawl and the titles used as file names
    html, soup = download_html(url)
    queue_url, queue_title = spider(soup=soup)

    # Keep crawling urls until the queue is empty
    while queue_url:
        url = queue_url.popleft()
        title = queue_title.popleft() + '.txt'
        print(title, url)

        html, soup = download_html(url)
        text_data = spider(html, soup)
        process_data(text_data, title)


url = r'http://www.seputu.com/'
os.mkdir(os.path.abspath(r'.\daomubiji1'))
controller(url)
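
One caveat when re-running the script: os.mkdir raises FileExistsError if the daomubiji1 directory already exists. A more forgiving variant (not in the original post) is os.makedirs with exist_ok=True:

import os

# Create the output directory only if it is missing, so the script can be re-run
os.makedirs(os.path.abspath(r'.\daomubiji1'), exist_ok=True)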