# Question (translated): when crawling Sina news with Python, the program hangs at the
# regex-matching step after roughly 260 articles; fetching that same page on its own
# works fine -- why? (Answer: see the NOTE in getContent below -- the retry `while`
# loops spin forever when the pattern does not match.)
import datetime
import re
import time
import urllib
import urllib2

from bs4 import BeautifulSoup
def getContent(url, index):
    """Download one Sina news article and write its paragraph text to <index>.txt.

    url   -- article page URL
    index -- integer used to name the output file
    Returns True when the article was extracted and written, False when the
    download failed or the page did not match the expected layout.
    """
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    myRequest = urllib2.Request(url, headers=headers)
    myPage = None
    for tries in range(5):
        try:
            myPage = urllib2.urlopen(myRequest).read()
            time.sleep(1)  # be polite: pause after each successful request
            break
        except Exception:
            # Original tested `tries < 5`, which is always true inside
            # range(5), so 'connection failed' was unreachable.
            if tries < 4:
                print('Attempt to reconnect')
            else:
                print('connection failed')
    if myPage is None:
        # All five attempts failed; original fell through and raised a
        # NameError on the unbound myPage.
        return False
    # NOTE(review): the original did `while len(textContent) == 0:` re-running
    # the SAME findall on the SAME page -- an infinite loop whenever an article
    # uses a different layout. That is why the crawler "hangs" after ~260
    # articles: one page simply lacks this <div>. Skip such pages instead.
    textContent = re.findall(
        '<div class="article article_16" id=".*?">(.*?)<p class="article-editor">',
        myPage, re.S)
    if not textContent:
        print(str(index) + ': article body not found, skipped')
        return False
    retContent = re.findall('<p.{0,100}?>(.*?)</p>', textContent[0], re.S)
    if not retContent:
        print(str(index) + ': no paragraphs found, skipped')
        return False
    outFile = open(str(index) + '.txt', 'w')  # avoid shadowing builtin `file`
    try:
        for text in retContent:
            outFile.write(text)
    finally:
        outFile.close()
    print(str(index) + '.txt has been written')
    return True
def crawler_urllist(start, end, typeNum):
    """Collect Sina roll-news index-page URLs for every day in [start, end].

    start, end -- date strings in "%Y-%m-%d" form (range is inclusive)
    typeNum    -- news-channel id, as a string, spliced into the query URL
    Writes one URL per line to <typeNum>.txt and returns that filename.

    NOTE(review): in the paste the `def` line was commented out, the body
    referenced `self.page`/`self.pages` without any class, `retList` was never
    appended to (the output file was always empty), and the final statement
    concatenated the builtin `type` with a string (TypeError). Reconstructed
    here as a standalone function.
    """
    partialUrl = [
        'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=',
        '&spec=&type=&date=',
        '&ch=01&k=&offset_page=0&offset_num=0&num=40&asc=&page=',
    ]
    dateFormat = "%Y-%m-%d"
    beginDate = datetime.datetime.strptime(start, dateFormat)
    endDate = datetime.datetime.strptime(end, dateFormat)
    oneDay = datetime.timedelta(days=1)
    # Build the list of day strings covering the whole interval.
    dateList = []
    for _ in range((endDate - beginDate).days + 1):
        dateList.append(datetime.datetime.strftime(beginDate, dateFormat))
        beginDate += oneDay
    retList = []
    for day in dateList:
        baseUrl = partialUrl[0] + typeNum + partialUrl[1] + day + partialUrl[2]
        page = 0
        while True:
            page += 1
            pageUrl = baseUrl + str(page)
            continueFlag, _body = _fetchIndexPage(pageUrl)
            print(pageUrl)
            if not continueFlag:
                break  # ran past the last page for this day
            retList.append(pageUrl)
            time.sleep(1)  # throttle between index-page requests
    date_url_list = open(typeNum + '.txt', 'w')
    try:
        for item in retList:
            date_url_list.write(item)
            date_url_list.write('\n')
    finally:
        date_url_list.close()
    return typeNum + '.txt'


def _fetchIndexPage(url):
    """Fetch one roll-news index page; return (has_news, body).

    has_news is False when the request fails or the interface response holds
    no article entries -- presumably the page number ran past the end for
    that day (TODO confirm against the live interface).
    """
    try:
        body = urllib2.urlopen(url).read()
    except Exception:
        return False, ''
    # The interface returns a JS object listing articles; an exhausted page
    # comes back without any "url" fields.
    return ('url' in body), body
# Driver: read the article-URL list and dump each article to <n>.txt.
#
# NOTE(review): the original did `list(set(readlines()))[0].split()` -- with a
# multi-line file that keeps ONE arbitrarily chosen line (set iteration order
# is unspecified) and silently drops the rest. Read the whole file instead and
# deduplicate while preserving order; also close the file handle.
urlFile = open('DomesticNewsUrlList.txt', 'r')
try:
    allUrls = urlFile.read().split()
finally:
    urlFile.close()
seen = set()
i = 1
for url in allUrls:
    if url in seen:
        continue  # skip duplicates without burning an output index
    seen.add(url)
    getContent(url, i)
    # time.sleep(1)  # getContent already sleeps after each successful fetch
    i += 1