Python 使用BeautifulSoup过程中遇到TypeError: object of type 'NoneType' has no len()
需求:获取url_list里每一个url的文件大小、类型和outlink的数量
```python
def getHTML(url, ua_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36', num_retries = 5):
headers = {'User-Agent': ua_agent}
request = urllib.request.Request(url=url, headers=headers)
html = None
try:
response = urllib.request.urlopen(request)
html = response.read().decode('utf-8')
except urllib.error.URLError or urllib.error.HTTPError as e:
if num_retries > 0:
if hasattr(e,'code') and 500 <= e.code < 600:
getHTML(url, ua_agent, num_retries - 1)
return html
这里打印html返回None,导致BeautifulSoup获取url出错
def get_url_num(html):
links = []
soup = BeautifulSoup(html,'html.parser')
url_list = soup.find_all('a')
for link in url_list:
link = link.get('href')
if link.startswith('http'):
links.append(link)
url_num = len(links)
return url_num
以下为原代码:
import requests
import pandas as pd
import urllib.error
import urllib.request
import ssl
from bs4 import BeautifulSoup
ssl._create_default_https_context = ssl._create_unverified_context
def getHTML(url, ua_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36', num_retries = 5):
headers = {'User-Agent': ua_agent}
request = urllib.request.Request(url=url, headers=headers)
html = None
try:
response = urllib.request.urlopen(request)
html = response.read().decode('utf-8')
except urllib.error.URLError or urllib.error.HTTPError as e:
if num_retries > 0:
if hasattr(e,'code') and 500 <= e.code < 600:
getHTML(url, ua_agent, num_retries - 1)
return html
def get_url_num(html):
links = []
soup = BeautifulSoup(html,'html.parser')
url_list = soup.find_all('a')
for link in url_list:
link = link.get('href')
if link.startswith('http'):
links.append(link)
url_num = len(links)
return url_num
df = pd.read_csv('fetch_nytimes.csv')
url_list = []
for i in df['URL']:
url_list.append(i)
print(url_list)
headers = {
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}
size_list = []
type_list = []
outlinks_list = []
for url in url_list:
try:
response = requests.get(url, stream=True, headers=headers)
# column 2
file_size = response.headers['Content-Length']
file_size = int(file_size)/1024/1024
size_list.append(file_size)
# column 4
file_type = response.headers['Content-Type']
type_list.append(file_type)
# column 3
html = getHTML(url)
outlinks = get_url_num(html)
outlinks_list.append(outlinks)
except urllib.error.URLError or urllib.error.HTTPError as e:
url_list.remove(url)
dict_visit = {}
dict_visit['URL'] = url_list
dict_visit['File_Size'] = size_list
dict_visit['Outlinks'] = outlinks_list
dict_visit['File_Type'] = type_list
df1 = pd.DataFrame(dict_visit)
df1.to_csv('visit_nytimes.csv', index=False)
```