HtOmArthas
HtOmArthas
采纳率33.3%
2016-12-03 08:44 阅读 2.0k

python爬取大量网页出现卡顿问题

4

用python爬取新浪新闻,在爬到260多篇的时候程序就会卡在正则表达式匹配处,单独爬取该网页没有问题,求问原因
import datetime
import re
import time
import urllib
import urllib2

from bs4 import BeautifulSoup
def getContent(url,index):
headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
myRequest = urllib2.Request(url,headers = headers)
for tries in range(5):

    try:

        myResponse = urllib2.urlopen(myRequest)
        myPage = myResponse.read()
        time.sleep(1)
        break
    except:
        if tries < 5:
            print 'Attempt to reconnect'

            continue
        else:
            print 'connection failed'


textContent = re.findall('<div class="article article_16" id=".*?">(.*?)<p class="article-editor">',myPage,re.S)
while len(textContent) == 0:
    textContent = re.findall('<div class="article article_16" id=".*?">(.*?)<p class="article-editor">',myPage,re.S)
retContent = re.findall('<p.{0,100}?>(.*?)</p>',textContent[0],re.S)
while len(retContent) == 0:
    retContent = re.findall('<p.{0,100}?>(.*?)</p>',textContent[0],re.S)
#print len(retContent)
file = open(str(index)+'.txt','w')
for text in retContent:     
    file.write(text)
file.close() 
print str(index)+'.txt has been written'

def crawler_urllist(start, end, typeNum):
    """Build the Sina roll-news index URLs for every date between *start*
    and *end* (inclusive) and save them, one per line, to ``<typeNum>.txt``.

    start, end -- date strings in '%Y-%m-%d' format
    typeNum    -- news-channel id string spliced into the query URL

    Returns the name of the file the URL list was written to.

    NOTE(review): the original paste had the ``def`` line commented out,
    referenced ``self.page``/``self.pages`` from some class-based variant,
    expected a ``(continueFlag, myPage)`` return from a getContent that does
    not provide one, and never appended anything to ``retList`` before
    writing it out.  This reconstruction records one page-1 index URL per
    date -- confirm against the intended per-page crawl behaviour.
    """
    partialUrl = [
        'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=',
        '&spec=&type=&date=',
        '&ch=01&k=&offset_page=0&offset_num=0&num=40&asc=&page=',
    ]
    dateFormat = "%Y-%m-%d"  # renamed: `format` shadows the builtin
    beginDate = datetime.datetime.strptime(start, dateFormat)
    endDate = datetime.datetime.strptime(end, dateFormat)
    oneDay = datetime.timedelta(days=1)
    diffDay = endDate - beginDate

    # Inclusive list of date strings between start and end.
    dateList = []
    for _ in range(diffDay.days + 1):
        dateList.append(datetime.datetime.strftime(beginDate, dateFormat))
        beginDate += oneDay

    # One index URL per date (page 1).
    retList = []
    for dateStr in dateList:
        retList.append(
            partialUrl[0] + typeNum + partialUrl[1] + dateStr + partialUrl[2] + '1')

    with open(typeNum + '.txt', 'w') as date_url_list:
        for item in retList:
            date_url_list.write(item)
            date_url_list.write('\n')
    # BUG FIX: original returned `type + '.txt'` -- `type` is the builtin,
    # an obvious typo for `typeNum`.
    return typeNum + '.txt'

# Driver: read the previously collected article URLs and download each one.
urlFile = open('DomesticNewsUrlList.txt', 'r')
naviurlList = urlFile.readlines()
urlFile.close()  # original leaked this file handle
# NOTE(review): the original did `list(set(naviurlList))[0].split()`, which
# keeps ONE arbitrary element of the set (set ordering is undefined) -- it
# only works if the whole file is a single whitespace-separated line.
# Deduplicate every URL from every line instead, preserving first-seen order.
seen = set()
urlList = []
for line in naviurlList:
    for u in line.split():
        if u not in seen:
            seen.add(u)
            urlList.append(u)
# enumerate replaces the hand-rolled `i = 1 ... i += 1` counter
for i, url in enumerate(urlList, 1):
    getContent(url, i)



  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 复制链接分享

2条回答 默认 最新

  • u010663768 mega4221 2016-12-03 10:59

    给request加上timeout看看,sleep时间再加大一点,正则匹配那里try一下

    点赞 评论 复制链接分享
  • oyljerry oyljerry 2016-12-04 01:03

    先把匹配前的数据打印出来看看,看是否抓取到数据

    点赞 评论 复制链接分享

相关推荐