import requests
from lxml import etree
from urllib.parse import urljoin
import queue
# import threading
# import time
new_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
url = 'http://www.biqugev.com/book/40234/'
r = requests.get(url, headers=new_headers)
r.encoding = r.apparent_encoding
html = etree.HTML(r.text)
# Collect every chapter link from the table of contents and make it absolute.
b = html.xpath("//div[@class='listmain']/dl/dd/a/@href")
b = [urljoin(url, i) for i in b]

# Queue each chapter up as a dict holding its index and URL.
q = queue.Queue()
for num in range(len(b)):
    qa = dict()
    qa['n'] = num
    qa['url'] = b[num]
    q.put(qa)
def parse_text(n, url):
    # print(n)
    r = requests.get(url, headers=new_headers)
    # print(r.status_code, r.encoding)
    html = etree.HTML(r.text)
    bb = html.xpath("//div[@class='content']/h1/text()")[0]               # chapter title
    aa = html.xpath("//div[@class='content']/div[@id='content']/text()")  # list of text nodes
    print(aa)
    # print(bb)
    aa = ''.join(aa)  # join the text nodes into a single string
    print('joined:', aa)
    # time.sleep(1)

# Take one chapter off the queue and parse it.
d = q.get()
parse_text(d['n'], d['url'])

Running into a problem where ''.join seems to "eat" part of the string
Accepted answer from 咕泡-三木, 2021-07-24 14:51:
Add one line of code:

print(len(aa))

You can see that aa is quite long, so the problem you describe of ''.join "eating" part of the string did not actually happen. Then add another line:

print(repr(aa))

This shows that the string contains a large number of special characters, which is why part of the content does not display properly.
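
Following that diagnosis, here is a minimal sketch of how the joined chapter text could be inspected and tidied up. The helper name and the specific characters it strips (non-breaking spaces and blank lines) are assumptions for illustration, not something confirmed for this page; run print(repr(aa)) first to see what is actually in the string.

# Hypothetical helper, not part of the original script: clean up the joined text.
def clean_chapter_text(raw_parts):
    text = ''.join(raw_parts)                 # same join as in parse_text
    text = text.replace('\xa0', ' ')          # assumed: non-breaking spaces -> spaces
    lines = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in lines if line)  # drop empty lines

# Example with made-up text nodes similar to what the XPath returns:
parts = ['\xa0\xa0First paragraph.', '\r\n', '\xa0\xa0Second paragraph.']
print(clean_chapter_text(parts))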