新手写了一个抓取网址的爬虫,但是发现 for url, favorites in zip(child_url_result, child_url_favorites_result):这个循环没有进入或者进入之后什么都没做就出去了,是迭代器在嵌套for循环内有什么规则吗?我试了上面的[a,b,c]循环可以进入并打印,请问是什么问题
class Crawler(object):
def __init__(self): # __init__,构造,初始化,实例化。
self.child_url_no = 1
self.top_videos_home = 'http://#####/v.php?category=rf&viewtype=basic&page='
self.headers = {'Cookie': 'Cookie',
'User-Agent': 'User-Agent',
'Refer': 'Refer'
}
self.child_url_list = []
self.child_url_favorites_list = []
def get_child_url_list(self, page_num):
for x in range(page_num): # 用page_num循环页数
# 获取网页源码,用str(self.child_url_no)改变网址页码并遍历
child_url_resp = requests.get(self.top_videos_home + str(self.child_url_no), verify=False, headers=self.headers)
# 获取两个迭代器用于筛选
child_url_favorites_resp = child_url_resp
child_url_resp.encoding = 'utf-8'
child_url_favorites_resp.encoding = 'utf-8'
time.sleep(2)
# 建立正则表达规则
child_url_object = re.compile(r'well-sm videos-text-align">\
\
<a href="(?P<child_url>.*?)">', re.S)
child_url_result = child_url_object.finditer(child_url_resp.text)
child_url_favorites_object = re.compile('Favorites:</span> (?P<child_url_favorites>.*?) <br>', re.S)
child_url_favorites_result = child_url_favorites_object.finditer(child_url_favorites_resp.text)
for i in ['a', 'b', 'c']: # 测试用
print(i) #正常打印
for url, favorites in zip(child_url_result, child_url_favorites_result): # 该循环没有进入或者进入无动作
self.child_url_list.append(url.group('child_url')) # 列表为空
self.child_url_favorites_list.append(favorites.group('child_url_favorites')) # 列表为空
self.child_url_no += 1 # 正常递增
if __name__ == '__main__':
C9PV = Crawler()
C9PV.get_child_url_list(3)