本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理
世界那么大,我想去看看。
要么读书,要么旅游,身体和心灵必须有一个在路上。
想必大家心里都向往旅游,那么旅游中的行程安排和地区热门景点有哪些呢?
这些信息大多需要到网上查找旅游攻略才能了解,今天就带大家采集旅游网站的景点数据。
- 系统分析网页性质
- 结构化的数据解析
- csv数据保存
import csv
import multiprocessing
from concurrent.futures import ProcessPoolExecutor

import parsel
import requests
python 3.6
pycharm
requests
parsel
csv
1.找数据所在的URL地址
2.发送网络请求
3.数据的解析(我们需要的数据)
4.数据的保存
# Process lock; save_data() holds it while appending rows to the CSV file.
lock = multiprocessing.Lock()


def send_request(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive server,
            which would stall a pool worker indefinitely.

    Returns:
        The response body decoded to ``str`` (``response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.text
def parse_data(html_data):
    """Extract destination records from the HTML of a city-list page.

    Args:
        html_data: HTML source of one list page.

    Yields:
        Tuples ``(travel_place, travel_people, travel_hot, travel_url,
        travel_imgUrl)``; a field is ``None`` when its XPath matches nothing
        (``travel_hot`` is an empty string in that case).
    """
    selector = parsel.Selector(html_data)
    lis = selector.xpath('//ul[@class="plcCitylist"]/li')
    for li in lis:
        travel_place = li.xpath('.//h3/a/text()').get()  # destination name
        travel_people = li.xpath('.//p[@class="beento"]/text()').get()  # number of people who have been there
        hot_spots = li.xpath('.//p[@class="pois"]/a/text()').getall()  # popular attractions
        travel_hot = '、'.join(hot.strip() for hot in hot_spots)
        travel_url = li.xpath('.//h3/a/@href').get()  # URL of the destination detail page
        travel_imgUrl = li.xpath('./p/a/img/@src').get()  # image URL
        print(travel_place, travel_people, travel_hot, travel_url, travel_imgUrl, sep=' | ')
        yield travel_place, travel_people, travel_hot, travel_url, travel_imgUrl


def save_data(data_generator):
    """Append every row produced by *data_generator* to ``穷游网.csv``.

    Args:
        data_generator: Iterable of row tuples, e.g. the result of
            ``parse_data``.
    """
    # Drain the generator first so parsing (and its printing) happens outside
    # the critical section.
    rows = list(data_generator)
    # Open, write and close the file while holding the lock: the rows only
    # reach the file when the buffer is flushed on close, so locking around
    # individual writerow() calls would not keep workers from interleaving.
    # ``with lock`` also guarantees release if writing raises.
    # NOTE: ``lock`` is created at module import; under the "spawn" start
    # method each worker re-imports the module and gets its own lock, so the
    # exclusion is only effective across processes when they are forked.
    with lock:
        with open('穷游网.csv', mode='a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerows(rows)


def main(url):
    """Fetch one list page, parse it and append its rows to the CSV file.

    Args:
        url: Address of the list page to process.
    """
    html_data = send_request(url)
    parse_result = parse_data(html_data)
    save_data(parse_result)


if __name__ == '__main__':
    # Single-page debug run: main('https://place.qyer.com/china/citylist-0-0-1')
    with ProcessPoolExecutor(max_workers=13) as executor:
        futures = {}
        for page in range(1, 172):
            url = f'https://place.qyer.com/china/citylist-0-0-{page}/'
            futures[executor.submit(main, url)] = url
        # Exceptions raised in workers are stored on the future; report them
        # instead of losing them silently, without aborting the other pages.
        for future, url in futures.items():
            error = future.exception()
            if error is not None:
                print(f'Failed to scrape {url}: {error!r}')