# -*- coding:UTF-8 -*-
import urllib2
import re
import jieba
import jieba.analyse
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import xlwt
from xlrd import open_workbook
from xlutils.copy import copy
# Word-frequency script: requests two pages from yyk.qqyy.com, extracts text
# from each page with a regular expression, segments that text with jieba and
# counts how many times every resulting token occurs, then prints the counts.
#
# NOTE(review): every line below sits at column 0, so the file is not valid
# Python as it stands. The nesting mentioned in the comments is the presumed
# original structure -- confirm it before re-indenting.

# Maps each segmented token to the number of times it has been seen so far.
wordList={}
# Not referenced anywhere else in the visible code.
key_list = []
# Dead assignment: the loop below rebinds x before it is ever read.
x=1
# x takes the values 0 and 1, so the URLs built below end in 1.html and 2.html.
for x in range(2):
urlstr='http://yyk.qqyy.com/search_dp0lmhks0i'+str(x+1)+'.html'
# Send a browser-like User-Agent header with the request.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
req = urllib2.Request(
url=urlstr,
headers=headers
)
# NOTE(review): the response object is never closed.
myResponse = urllib2.urlopen(req)
# The body is read in full and decoded as UTF-8.
html = myResponse.read().decode('utf-8')
# NOTE(review): the pattern literal passed to re.compile() below is missing --
# only a lone quote character is left, which is an unterminated string. The
# original expression has to be restored before this can run. The code further
# down indexes each match with [0] (and the commented-out prints go up to [5]),
# so the pattern presumably had several capture groups -- TODO confirm.
pattern = re.compile(
'
)
# All matches of the pattern in the downloaded page.
items = re.findall(pattern, html)
# Debug output, disabled:
# for x in items:
# print x[0],x[1],x[2],x[3],x[4],x[5]
# Presumably nested inside the page loop, so every page's matches are processed.
for xdetail in items:
# print xdetail[0], xdetail[1], xdetail[2], xdetail[3], xdetail[4], xdetail[5]
# Use a regular expression to strip unwanted characters (spaces, ASCII
# parentheses and HTML tags) from the first element of the match.
# re.subn returns a (new_string, substitution_count) pair, hence the [0] below.
# NOTE(review): the alternatives "(|)" and "()" are groups that match only the
# empty string; they look like characters lost in transcoding (possibly
# full-width parentheses) -- TODO confirm against the original source.
xdetailtext = re.subn(u' |\(|\)|(|)|<.*?>|()|</.*?>', "", xdetail[0])
# print xdetailtext[0]
# Segment the cleaned text; cut_all=True asks jieba for its full mode.
seg_list = jieba.cut(xdetailtext[0], cut_all=True)
# Tally every token produced by the segmenter.
for word1 in seg_list:
# dict.has_key exists only in Python 2.
if wordList.has_key(word1):
wordList[word1] += 1
else:
wordList[word1] = 1
# print wordList
# Print one "token:count" line per distinct token (Python 2 print statement).
# NOTE(review): whether this runs once after all pages or once per page depends
# on the lost indentation -- TODO confirm.
for wordkey in wordList.keys():
print wordkey + ":" + str(wordList[wordkey])
这是我写到分词这一步的程序，但是不知道怎样把结果导入到 Excel 中。