def parseHtml(html):
    """Yield the ``href`` of every headline link found in *html*.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser, and
    the anchors are located with the CSS selector
    ``.news-list-b .list .item .title a``.

    This is a generator: nothing is parsed until the result is iterated.
    Each matched anchor produces exactly one value, obtained with
    ``Tag.get('href')``; an anchor without an ``href`` attribute therefore
    yields ``None`` rather than being skipped.

    Args:
        html: The HTML document to parse, passed straight to BeautifulSoup.

    Yields:
        The ``href`` attribute value of each matched ``<a>`` element, in
        document order.
    """
    soup = BeautifulSoup(html, 'lxml')

    # BeautifulSoup quick reference (kept from the original notes):
    #   print(soup.prettify())                 -> pretty-printed markup
    #   soup.find_all('div', attrs={'class': 'news-list-b'})
    #                                          -> search by tag + attributes
    #   soup.select('Tag')                     -> CSS selector, by tag name
    #   soup.select('.class')                  -> CSS selector, by class
    #   soup.select('#id')                     -> CSS selector, by id
    #   soup.select('Tag[attr]')               -> tag that has an attribute
    #   soup.select('.class Tag #id')          -> a space means "descendant";
    #                                             selectors can be combined
    #   tag.get_text()                         -> text content of one tag
    #   tag.get('className')                   -> attribute value (or None)
    #   tag['className']                       -> attribute value (KeyError
    #                                             if missing)
    #   tag.attrs.get('className')             -> same, via the attrs dict
    items = soup.select('.news-list-b .list .item .title a')
    for item in items:
        # Tag.get reads from the tag's attrs dict and returns None when the
        # attribute is absent.
        yield item.get('href')