1.获取单条新闻的#标题#链接#时间#来源#内容 #点击次数,并包装成一个函数。
2.获取一个新闻列表页的所有新闻的上述详情,并包装成一个函数。
def printnews(url):
    """Fetch ONE news-list page and print time/title/link/click-count for each item on it.

    url: address of a news-list page on news.gzcc.cn.
    Relies on the module-level helpers getdetail / getdetailtime / getclickurl
    and on the module-level `requests`, `BeautifulSoup`, `datetime` imports.
    """
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    # BUG FIX: the original built the soup from the module-global `res` and then
    # iterated the module-global `soup`, so every call printed the FIRST page
    # regardless of `url`. Use this request's own response and its own parse.
    soupp = BeautifulSoup(resp.text, 'html.parser')
    for news in soupp.select('li'):
        # only <li> elements that actually carry a news title are news entries
        if len(news.select('.news-list-title')) > 0:
            time = news.select('.news-list-info')[0].span.text  # list-page date, e.g. '2017-10-09'
            title = news.select('.news-list-title')[0].contents[0]  # headline text
            # BUG FIX: use a new name instead of shadowing the `url` parameter
            link = news.select('a')[0]['href']  # detail-page URL
            source = news.select('.news-list-info')[0].select('span')[1].text  # source/department
            detail = getdetail(link)  # full article body (fetched but not printed below)
            dtime = getdetailtime(link)  # clock time from the detail page
            addt = time + '-' + dtime
            # combine date + clock time and parse the str into a datetime
            dt = datetime.strptime(addt, '%Y-%m-%d-%H:%M:%S')
            # the click-count API returns JS like "...html('123');" — strip the wrapper, keep the int
            click = int(requests.get(getclickurl(link)).text.split('.')[-1].lstrip("html('").rstrip("');"))
            print("时间:", time, "\t标题:", title, "\t链接:", link, "\t点击次数:", click)
            print("---------------------|这是一条可爱的分割线|----------------------------")
3.获取所有新闻列表页的网址,调用上述函数。
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

# Landing page of the campus-news section; parsed once at module level.
gzccurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(gzccurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')


def getpage():
    """Return the number of list pages.

    The '.a1' element on the landing page reads like '237条' (total item
    count); ten items per page, plus one page for the remainder.
    """
    total_items = int(soup.select('.a1')[0].text.rstrip("条"))
    return total_items // 10 + 1


# Task 3: build the address of every list page after the first (2 .. getpage()).
for i in range(2, getpage() + 1):
    listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
4.完成所有校园新闻的爬取工作。
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

# Landing page of the campus-news section; parsed once at module level so
# getpage() can read the total item count.
gzccurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(gzccurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
print("---------------------|--------------------|----------------------------\n---------------------|这是一条可爱的分割线|----------------------------\n---------------------|____________________|----------------------------")


def getdetail(url):
    """Return the full article body ('.show-content') of one detail page."""
    resn = requests.get(url)
    resn.encoding = 'utf-8'
    soupn = BeautifulSoup(resn.text, 'html.parser')
    detail = soupn.select('.show-content')[0].text
    return detail


def getdetailtime(url):
    """Return the clock time sliced out of the '.show-info' line of a detail page."""
    rest = requests.get(url)
    rest.encoding = 'utf-8'
    soupt = BeautifulSoup(rest.text, 'html.parser')
    # chars [16:24] of the info line hold the 'HH:MM:SS' part — position-based
    # and fragile; NOTE(review): confirm against the page layout.
    detailtime = soupt.select('.show-info')[0].text[16:24]
    return detailtime


def getclickurl(url):
    """Build the click-counter API URL for a news detail-page URL."""
    # e.g. .../2017/xiaoyuanxinwen_1017/8448.html captures '1017/8448';
    # the article id is the part after the '/'.
    # (renamed from `id`, which shadowed the builtin)
    news_id = re.match('http://news.gzcc.cn/html/2017/xiaoyuanxinwen_(.*).html', url).groups()
    clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id[0].split('/')[1])
    return clickurl


def printnews(url):
    """Fetch ONE news-list page and print time/title/link/clicks/body for each item."""
    resp = requests.get(url)
    resp.encoding = 'utf-8'
    # BUG FIX: the original parsed the module-global `res` and iterated the
    # module-global `soup`, so every call re-printed the FIRST page no matter
    # which `url` was passed. Use this request's own response and parse.
    soupp = BeautifulSoup(resp.text, 'html.parser')
    for news in soupp.select('li'):
        # only <li> elements carrying a news title are real news entries
        if len(news.select('.news-list-title')) > 0:
            time = news.select('.news-list-info')[0].span.text  # list-page date
            title = news.select('.news-list-title')[0].contents[0]  # headline
            # BUG FIX: new name instead of shadowing the `url` parameter
            link = news.select('a')[0]['href']  # detail-page URL
            source = news.select('.news-list-info')[0].select('span')[1].text  # source/department
            detail = getdetail(link)
            dtime = getdetailtime(link)
            addt = time + '-' + dtime
            # combine date + clock time, then parse the str into a datetime
            dt = datetime.strptime(addt, '%Y-%m-%d-%H:%M:%S')
            # the counter API returns JS like "...html('123');" — peel off the wrapper
            click = int(requests.get(getclickurl(link)).text.split('.')[-1].lstrip("html('").rstrip("');"))
            print("时间:", time, "\t标题:", title, "\t链接:", link, "\t点击次数:", click, "\n详情:", detail)
            print("---------------------|这是一条可爱的分割线|----------------------------")


def getpage():
    """Return the number of list pages: total count from '.a1' (e.g. '237条') // 10 + 1."""
    lists = int(soup.select('.a1')[0].text.rstrip("条"))
    page = lists // 10 + 1
    return page


# Task 4: crawl the landing page, then every numbered list page.
printnews(gzccurl)
for i in range(2, getpage() + 1):
    listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    printnews(listurl)
5.完成自己所选其他主题相应数据的爬取工作。
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

# Task 5: same crawl, different site — AOPA news list, category 8.
aopaurl = 'http://www.aopa.org.cn/news/list.php?catid=8'
res = requests.get(aopaurl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')


def getdetail(url):
    """Return the article body ('.content') of one AOPA detail page."""
    page = requests.get(url)
    page.encoding = 'utf-8'
    parsed = BeautifulSoup(page.text, 'html.parser')
    return parsed.select('.content')[0].text


def printnews(url):
    """Fetch ONE AOPA list page and print time/title/link for every entry on it."""
    listing = requests.get(url)
    listing.encoding = 'utf-8'
    listing_soup = BeautifulSoup(listing.text, 'html.parser')
    for container in listing_soup.select('.catlist'):
        for entry in container.select('.catlist_li'):
            if len(entry) > 0:  # skip empty list rows
                time = entry.select('span')[0].text  # publication time string
                title = entry.select('a')[0]['title']  # headline
                link = entry.select('a')[0]['href']  # detail-page URL
                # parse the time str into a datetime
                dt = datetime.strptime(time, '%Y-%m-%d %H:%M')
                detail = getdetail(link)  # fetched but not printed
                print("时间:", dt, "\t标题:", title, "\t链接:", link)


# Crawl the landing page, then list pages 2 through 43.
printnews(aopaurl)
for i in range(2, 44):
    listurl = 'http://www.aopa.org.cn/news/list-8-{}.html'.format(i)
    printnews(listurl)