程序源代码
import requests
import re
# Request headers: present a desktop browser User-Agent to the server.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
    ),
}
# Page that the script downloads and scans for links.
url = 'https://815864.yichafen.com'
def ask(primary_url, timeout=10):
    """Download a page and return its body as text.

    Sends a GET request to ``primary_url`` using the module-level ``header``
    dict as the request headers, prints the decoded body, and returns it.

    Args:
        primary_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds. Other ``requests`` errors propagate
            unchanged.
    """
    response = requests.get(primary_url, headers=header, timeout=timeout)
    page_text = response.text
    # Echo the raw page so it is visible exactly what the server sent back.
    print(page_text)
    return page_text
def analysis(code):
    """Find the styled hyperlink tags in a page's HTML source.

    Searches ``code`` for ``<a>`` tags that carry the inline style
    ``margin-left:10px;``, followed by five tab characters, an ``href``
    attribute, ``target="_blank"``, and at most 30 characters of link text.
    The matches are printed and returned.

    Args:
        code: HTML source to scan.

    Returns:
        A list with the full text of every matching ``<a ...>...</a>`` tag,
        in document order; empty when nothing matches.
    """
    # The href value is matched with [^"\n]+ and the link text lazily, so
    # that several anchors on one line are reported separately instead of
    # being swallowed into one greedy match.
    link_pattern = (
        r'<a style="margin-left:10px;"\t\t\t\t\t'
        r'href="[^"\n]+" target="_blank">.{0,30}?</a>'
    )
    results = re.findall(link_pattern, code)
    print(results)
    return results
def main():
    """Fetch the page at ``url`` and run the link search on its source."""
    page_source = ask(url)
    analysis(page_source)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
访问几次之后得到如下结果,其中标红的部分每次请求都会改变。
但是用浏览器访问时不会被阻止。
请问服务器是怎么区分浏览器和爬虫的?