python爬取图文新闻_python爬取新闻需要什么软件

2017-02-16 回答

需求：

用到的python模块：

import re # 正则表达式

import bs4 # beautiful soup 4 解析模块

import urllib2 # 网络访问模块

import news #自己定义的新闻结构

import codecs #解决编码问题的关键，使用codecs.open打开文件

import sys #1解决不同页面编码问题

其中bs4需要自己装一下，安装方法可以参考：windows命令行下pip安装python whl包

程序：

#coding=utf-8

import re # 正则表达式

import bs4 # beautiful soup 4 解析模块

import urllib2 # 网络访问模块

import news #自己定义的新闻结构

import codecs #解决编码问题的关键，使用codecs.open打开文件

import sys #1解决不同页面编码问题

# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module has to be reloaded to get the function back (steps 2 and 3 of the
# encoding workaround that starts with ``import sys``).
reload(sys)  # 2

# Make implicit str <-> unicode conversions use UTF-8 instead of ASCII.
sys.setdefaultencoding('utf-8')  # 3

# 从首页获取所有链接

def getallurl(home):
    """Seed the crawl queue with the article links found on a page.

    Downloads ``home``, parses it with Beautiful Soup and adds the ``href``
    of every ``<a>`` tag matching the Baijia article URL pattern to the
    module-level ``url_set``.

    Args:
        home: URL of the page to scan for article links.
    """
    html = urllib2.urlopen(home).read().decode('utf8')
    soup = bs4.BeautifulSoup(html, 'html.parser')

    # Raw string so that \w stays a regex word class; the dots are escaped
    # so they match a literal "." rather than any character.
    pattern = re.compile(r'http://\w+\.baijia\.baidu\.com/article/\w+')

    for link in soup.find_all('a', href=pattern):
        url_set.add(link['href'])

def getnews(url):
    """Crawl queued URLs until one article has been saved.

    Pops URLs from the module-level ``url_set`` one at a time, records each
    in ``url_old``, queues any new article links found on the page, extracts
    the article fields and passes the article to ``savenews``. Returns after
    the first article that is saved successfully, or when the queue is empty
    or ``maxnewscount`` has already been reached.

    Args:
        url: Ignored. The URL to crawl is always taken from ``url_set``;
            the parameter is kept so existing callers keep working.
    """
    global newscount, maxnewscount  # module-level article counters

    # Link matching rule. Raw string so \w stays a regex word class, dots
    # escaped so they match a literal ".".
    link_pattern = re.compile(r'http://\w+\.baijia\.baidu\.com/article/\w+')

    while url_set and newscount < maxnewscount:
        # Take the next URL and remember that it has been visited.
        url = url_set.pop()
        url_old.add(url)

        try:
            # Fetch and parse the page.
            html = urllib2.urlopen(url).read().decode('utf8')
            soup = bs4.BeautifulSoup(html, 'html.parser')

            # Queue article links that have not been crawled yet.
            for link in soup.find_all('a', href=link_pattern):
                if link['href'] not in url_old:
                    url_set.add(link['href'])

            # Extract the article fields.
            # NOTE: article.author is never populated here, so it keeps
            # whatever default news.news() assigns.
            article = news.news()
            article.url = url
            page = soup.find('div', {'id': 'page'})
            article.title = page.find('h1').get_text()
            info = page.find('div', {'class': 'article-info'})
            article.date = info.find('span', {'class': 'time'}).get_text()
            article.about = page.find('blockquote').get_text()
            paragraphs = page.find('div', {'class': 'article-detail'}).find_all('p')
            article.content = ''
            for node in paragraphs:
                article.content += node.get_text() + '\n'

            savenews(article)
        except Exception as e:
            # Best effort: a page that cannot be fetched or does not have
            # the expected layout is reported and skipped.
            print(e)
            continue

        # Success path. This used to sit in a try/else clause, but the
        # ``break`` inside ``try`` skipped it, so the counter never moved.
        print(article.title)
        newscount += 1
        print(newscount)
        break

def savenews(object):
    """Append one article to the module-level output ``file``.

    Writes the bracketed title, the author and the date on one
    tab-separated line, then the content followed by a blank line.
    Fields that are still ``None`` are written as empty strings instead of
    raising ``TypeError`` during concatenation.

    Args:
        object: Article with ``title``, ``author``, ``date`` and ``content``
            attributes. The name shadows the builtin but is kept for
            backward compatibility with existing callers.
    """
    title = object.title or ""
    author = object.author or ""
    date = object.date or ""
    content = object.content or ""

    file.write("【" + title + "】" + "\t")
    file.write(author + "\t" + date + "\n")
    file.write(content + "\n" + "\n")

url_set = set()  # URLs waiting to be crawled
url_old = set()  # URLs that have already been crawled

newscount = 0  # number of articles saved so far
maxnewscount = 3  # stop once this many articles have been saved

home = 'http://baijia.baidu.com/'  # crawl starting point

getallurl(home)

# Raw string: in a normal literal the "\t" in this path is a TAB character.
# The handle must stay bound to the global name ``file`` because savenews()
# writes through it. The ``with`` block closes it even if crawling fails.
with codecs.open(r"d:\test.txt", "a+", "utf-8") as file:
    # Iterate over a snapshot: getnews() pops from and adds to url_set, and
    # changing a set while iterating over it raises RuntimeError.
    for url in list(url_set):
        getnews(url)
        # Stop as soon as enough articles have been collected.
        if newscount >= maxnewscount:
            break

新闻文章结构

#coding: utf-8

# 文章类定义

class news(object):
    """Plain record holding the fields of one crawled article.

    Every field starts as ``None`` and is filled in by the crawler.
    """

    def __init__(self):
        self.url = None      # address the article was fetched from
        self.title = None    # headline text
        self.author = None   # author name
        self.date = None     # publication date text
        self.about = None    # summary / lead-in text
        self.content = None  # body text

对爬取的文章数量进行统计。

文章知识点与官方知识档案匹配，可进一步学习相关知识Python入门技能树首页概览211389 人正在系统学习中相关资源：开源的爬虫软件Heritrix3.1.0_开源爬虫-Java工具类资源-CSDN文库

声明：本站部分文章及图片源自用户投稿，如本站任何资料有侵权请您尽早请联系jinwei@zod.com.cn进行处理,非常感谢！

python爬取图文新闻_python爬取新闻需要什么软件

相关推荐