"""Scrape the Sina military news roll page and save each article as Markdown.

The script downloads the roll (index) page, collects the detail-page links
listed in it, downloads every detail page, and writes the article text to
``<OUTPUT_DIR>/<title>.md``.
"""
import os
import re
import sys
from urllib.parse import urljoin, urlparse

import requests
from lxml import etree
from pyquery import PyQuery as pq

url = 'http://mil.news.sina.com.cn/roll/index.d.html?cid=57918'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3704.400 QQBrowser/10.4.3587.400'
    )
}

# Directory the articles are written into (created on demand).
OUTPUT_DIR = '新闻'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
# Upper bound on the length of a generated file name (without extension).
MAX_NAME_LENGTH = 80
# Characters that are not allowed in file names on common file systems.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def fetch_text(page_url):
    """Download *page_url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status.
    """
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # A stray invalid byte should not abort the whole scrape.
    return response.content.decode('utf-8', errors='replace')


def sanitize_filename(text):
    """Return *text* reduced to a string that is safe to use as a file name."""
    collapsed = ' '.join(text.split())
    cleaned = UNSAFE_FILENAME_CHARS.sub('_', collapsed).strip(' .')
    return cleaned[:MAX_NAME_LENGTH].rstrip(' .')


def article_filename(title, link):
    """Build a file name (without extension) for an article.

    The sanitized title is preferred; when the page has no usable title the
    last path segment of *link* is used instead, and ``'untitled'`` as a last
    resort.
    """
    link_stem = os.path.splitext(os.path.basename(urlparse(link).path))[0]
    for candidate in (title, link_stem):
        name = sanitize_filename(candidate)
        if name:
            return name
    return 'untitled'


def save_article(link):
    """Download the article at *link* and write its text to ``OUTPUT_DIR``.

    Returns:
        The path of the written file, or ``None`` when the page contains no
        article text (nothing is written in that case).

    Raises:
        requests.RequestException: if the page cannot be downloaded.
        OSError: if the file cannot be written.
    """
    page = pq(fetch_text(link))
    # Title via class selector, article body via id selector.
    title = page('.main-title').text()
    article = page('#article').text()
    if not article:
        return None

    path = os.path.join(OUTPUT_DIR, article_filename(title, link) + '.md')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(article)
    return path


def main():
    """Fetch the roll page and save every article it links to."""
    index = etree.HTML(fetch_text(url))
    # URLs of the detail pages listed on the roll page.
    details_data_url = index.xpath('//div[@class="fixList"]/ul/li/a/@href')

    # open() does not create missing directories.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for href in details_data_url:
        # Resolve relative or protocol-relative links against the roll page.
        link = urljoin(url, href)
        try:
            saved = save_article(link)
        except (requests.RequestException, OSError) as exc:
            # One broken article must not stop the remaining downloads.
            print('skipped {}: {}'.format(link, exc), file=sys.stderr)
            continue
        if saved is None:
            print('skipped {}: no article text'.format(link), file=sys.stderr)


if __name__ == '__main__':
    main()
抓取新浪军事新闻热点
最后发布:2019-06-10 13:38:46 首次发布:2019-06-10 13:38:46