# Python: scrape Chinese text from websites
import requests
import json
import re
import time
from bs4 import BeautifulSoup
import os
def get_html(url, timeout=10):
    """Download *url* with a browser-like User-Agent and return its source.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled server blocks the caller forever.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        # Pretend to be a desktop browser. Adjacent literals are used instead
        # of a backslash continuation so indentation never leaks into the
        # header value.
        # NOTE(review): the value is kept verbatim, including the stray
        # spaces in "Chrome/52 .0.2743. 116" -- confirm whether they should
        # be removed.
        'User-Agent': (
            'Mozilla/5.0(Macintosh; Intel Mac OS X 10_11_4)'
            'AppleWebKit/537.36(KHTML, like Gecko) Chrome/52 .0.2743. 116 Safari/537.36'
        ),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text responses, which turns Chinese pages into mojibake. Use the
    # encoding detected from the body in that case.
    if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding
    return response.text
def main(path="urls.txt"):
    """Print the text of every link found inside a list item on each page.

    Reads one URL per line from *path*, downloads each page with
    ``get_html`` and prints the string of every ``<a>`` nested in an
    ``<li>``. Anchors whose ``.string`` is ``None`` are skipped.

    Example line in the URL file:
        https://web.archive.org/web/20151218011223/http://mofatie.net

    Args:
        path: Text file holding the URLs to scrape, one per line.
    """
    import sys

    # The context manager closes the file, and iterating it ends at EOF
    # (the previous ``while 1`` loop never terminated).
    with open(path) as url_file:
        for line in url_file:
            url = line.strip()
            if not url:
                continue  # ignore blank lines
            try:
                html = get_html(url)
            except requests.RequestException as err:
                # One unreachable link should not abort the whole run.
                print("failed to fetch {}: {}".format(url, err), file=sys.stderr)
                continue
            # Parse the downloaded page, not a literal placeholder string.
            soup = BeautifulSoup(html, 'lxml')
            for li in soup.find_all(name='li'):  # parent nodes
                for a in li.find_all(name='a'):  # child links
                    if a.string is not None:
                        print(a.string)


if __name__ == "__main__":
    main()