# First attempt -- formatting cleaned up.
# --- Script 1: box-office page scraper (piaofang168.com) ---
import urllib.request
import re
def into(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The body is also echoed to stdout, matching the original behaviour.
    Raises urllib.error.URLError on network failure and
    UnicodeDecodeError if the page is not valid UTF-8.
    """
    # Context manager guarantees the HTTP connection is closed even if
    # read()/decode() raises (the original leaked the response object).
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')
    print(html)
    return html
def find(url):
    """Download *url* via into() and append every regex match to a.txt.

    Each match is printed to stdout and appended to the file, matching
    the original behaviour.
    """
    html = into(url)
    # NOTE(review): '(.*?)' is a bare non-greedy group with nothing around
    # it, so it matches the empty string at every position -- this looks
    # like a placeholder pattern that still needs the real markup filled
    # in.  Kept as-is to preserve behaviour.
    pattern = re.compile('(.*?)', re.S)
    items = re.findall(pattern, html)
    # Open the output file once (the original re-opened and re-closed it
    # for every single match) and let the context manager close it.
    with open("a.txt", "a") as out:
        for item in items:
            print(item)
            out.write(item)
# Target page scraped when this file is run as a script.
url = "http://www.piaofang168.com/"

if __name__ == '__main__':
    find(url)
# --- Script 2: codeforge.cn listing crawler ---
#!/usr/bin/env python
#coding:utf-8
import urllib.request
from bs4 import BeautifulSoup
def parse_list(url):
    """Fetch one listing page and hand each entry's detail URL to parse_data().

    Entries whose markup lacks the expected h3 > a structure are reported
    to stdout and skipped.  Reads the module-level `verbose` flag and the
    `data_base_url` prefix; detail links on the page are site-relative.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib.request.Request(url, headers=headers)
    # Context manager closes the HTTP connection (the original never
    # closed the response object).
    with urllib.request.urlopen(req, timeout=60) as page:
        contents = page.read()
    soup = BeautifulSoup(contents, "lxml")
    for tag in soup.find_all('div', class_='content-list'):
        try:
            data_url = tag.h3.a.attrs['href']
        except AttributeError:
            print("error at:", tag.get_text())
        else:
            if verbose:
                print(data_url)
            parse_data(data_base_url + data_url)
def parse_data(url):
    """Scrape title and read/download counters from one detail page.

    On success, writes one CSV row via write_data().  Pages that are not
    valid UTF-8, or whose markup lacks the expected structure, are
    reported to stdout and skipped.  Reads the module-level `verbose`
    flag for optional debug output.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib.request.Request(url, headers=headers)
    # Context manager closes the HTTP connection (the original never
    # closed the response object).
    with urllib.request.urlopen(req, timeout=60) as page:
        raw = page.read()
    try:
        # Only the decode can raise UnicodeDecodeError, so keep the try
        # body minimal.
        contents = raw.decode('UTF-8')
    except UnicodeDecodeError:
        print("UnicodeDecodeError: " + url)
    else:
        soup = BeautifulSoup(contents, "lxml")
        try:
            tag = soup.find('div', id='homepost')
            title = tag.find('div', class_='toptit').h2.get_text()
            if verbose:
                print(title)
            trs_left = tag.find('table', class_="infotable").find_all('tr')
            if verbose:
                print(trs_left)
            # Rows 1-3 of the info table hold the three counters we want.
            read_num = trs_left[1].td.span.get_text()
            download_num = trs_left[2].td.span.get_text()
            download_points = trs_left[3].td.span.get_text()
        except AttributeError:
            # Any missing element in the chain above lands here.
            print("error at:", url)
        else:
            write_data(title, read_num, download_num, download_points, url)
def write_data(title, read_num, download_num, download_points, url):
    """Append one comma-separated record to the module-level output file `f`."""
    row = ",".join((title, read_num, download_num, download_points, url))
    f.write(row + "\n")
# Listing pages are base_url + page number; detail links found on them are
# relative to data_base_url.
base_url = 'http://www.codeforge.cn/l/0/c/0/t/0/v/0/p/'
data_base_url = 'http://www.codeforge.cn'
# Output handle shared by write_data().
# NOTE(review): opened as a side effect of import and with no explicit
# encoding; consider opening it inside the __main__ guard with `with`.
f = open('data.csv', 'w')
# When True, parse_list/parse_data print extra progress output.
verbose = False
if __name__ == '__main__':
    # CSV header row.
    f.write("title, read_num, download_num, download_points, url \n")
    try:
        # Crawl listing pages 0..999; the progress message suggests each
        # listing page holds about 10 entries -- TODO confirm.
        for i in range(1000):
            parse_list(base_url + str(i))
            # Flush after every page so partial results survive a crash.
            f.flush()
            print("has finish %s" % str((i+1)*10))
    finally:
        # The original never closed the output file; ensure it is closed
        # even if a page raises.
        f.close()