1. Multiprocess Crawler
For crawls that involve a large amount of data or heavy post-processing, Python's multiprocessing or multithreading facilities can help. Multiprocessing starts several independent processes that the operating system can schedule onto multiple CPU cores in parallel; multithreading runs several cooperating threads inside a single process, and in CPython the global interpreter lock (GIL) means only one thread executes Python bytecode at any given moment. Python ships several modules for both models; here the multiprocessing module is used to build a multiprocess crawler. Testing showed that the target site has anti-crawler measures, so the crawler starts to report errors once the number of URLs and processes grows large.
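Before the full crawler, a minimal self-contained sketch of the work-distribution model that multiprocessing.Pool provides (the square function, the four-worker pool, and the input range here are illustrative choices, not part of the crawler below):

#!/usr/bin/env python3
from multiprocessing import Pool, current_process

def square(n):
    # Each call may run in a different worker process.
    print("%s: %d -> %d" % (current_process().name, n, n * n))
    return n * n

if __name__ == "__main__":
    with Pool(processes=4) as pool:            # four worker processes
        results = pool.map(square, range(8))   # blocks until every item is done
    print(results)                             # returned in input order: [0, 1, 4, ...]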
2. Code
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import time
import requests
from multiprocessing import Pool

duanzi_list = []

def get_web_html(url):
    '''
    Fetch the HTML of the page at url; return None on failure.
    '''
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    response = None
    try:
        req = requests.get(url, headers=headers)
        if req.status_code == 200:
            response = req.text
    except Exception as e:
        print(e)
    return response
def scrap_qiushi_info(url):
    '''
    Parse one listing page at url and collect the joke entries.
    '''
    html = get_web_html(url)
    if html is None:
        return duanzi_list
    usernames = re.findall(r'<h2>(.*?)</h2>', html, re.S | re.M)
    levels = re.findall(r'<div class="articleGender \w*Icon">(\d+)</div>', html, re.S | re.M)
    laugh_counts = re.findall(r'<span class="stats-vote">.*?<i class="number">(\d+)</i>', html, re.S | re.M)
    comment_counts = re.findall(r'<i class="number">(\d+)</i> 评论', html, re.S | re.M)
    contents = re.findall(r'<div class="content">.*?<span>(.*?)</span>', html, re.S | re.M)
    for username, level, laugh_count, comment_count, content in zip(usernames, levels, laugh_counts, comment_counts, contents):
        information = {
            "username": username.strip(),
            "level": level.strip(),
            "laugh_count": laugh_count.strip(),
            "comment_count": comment_count.strip(),
            "content": content.strip()
        }
        duanzi_list.append(information)
    time.sleep(1)  # pause between pages so requests are not fired back to back
    return duanzi_list
def normal_scraper(url_lists):
    '''
    Baseline: crawl every URL sequentially in a single process.
    '''
    begin_time = time.time()
    for url in url_lists:
        scrap_qiushi_info(url)
    end_time = time.time()
    print("Serial crawler total time: %f" % (end_time - begin_time))
def multi_process_scraper(url_lists, process_num=2):
    '''
    Crawl the URLs with a multiprocessing.Pool of process_num workers.
    '''
    begin_time = time.time()
    pool = Pool(processes=process_num)
    pool.map(scrap_qiushi_info, url_lists)  # blocks until all pages are processed
    pool.close()
    pool.join()
    end_time = time.time()
    print("Crawl time with %d processes: %s" % (process_num, (end_time - begin_time)))
def main():
    '''
    Entry point: build the page URLs with a list comprehension, then
    run the serial crawler followed by the multiprocess one.
    '''
    url_lists = ['https://www.qiushibaike.com/text/page/{}'.format(i) for i in range(1, 11)]
    normal_scraper(url_lists)
    multi_process_scraper(url_lists, process_num=2)

if __name__ == "__main__":
    main()
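One caveat worth spelling out: each pool worker is a separate process with its own copy of the duanzi_list global, so entries appended inside the workers never reach the parent's list, and the value scrap_qiushi_info returns keeps growing with every page the same worker handles. Since pool.map already ships each call's return value back to the parent, a cleaner pattern is to collect results from it. A sketch, assuming scrap_qiushi_info is changed to build and return a fresh list for a single page:

def multi_process_scraper_collect(url_lists, process_num=2):
    # Assumes scrap_qiushi_info returns a new list of dicts per page
    # rather than appending to a module-level global.
    with Pool(processes=process_num) as pool:
        per_page = pool.map(scrap_qiushi_info, url_lists)  # one list per URL
    return [entry for page in per_page for entry in page]  # flatten into one list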
3. Multithreading Mechanism