Python 抓取网站汉字

Python 抓取网站汉字

import requests
import json
import re
import time
from bs4 import BeautifulSoup
import os



def get_html(url, timeout=10):
    """Download *url* with a browser-like User-Agent and return the page source.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get`` so a stalled host cannot hang the
            script indefinitely.

    Returns:
        The response body decoded to text (``response.text``).

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection failure or timeout.
    """
    headers = {
        # Built from adjacent literals: a backslash continuation inside the
        # quotes would embed the next line's indentation in the header value.
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/52.0.2743.116 Safari/537.36'
        ),
    }  # pretend to be a regular browser
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


# Read one URL per line from urls.txt, download each page and print the text
# of every link that sits inside a list item.
# Example line: https://web.archive.org/web/20151218011223/http://mofatie.net
# utf-8-sig tolerates a byte-order mark at the start of the file, which would
# otherwise end up glued to the first URL.
with open("urls.txt", encoding="utf-8-sig") as url_file:
    for line in url_file:
        url = line.strip()
        if not url:
            continue  # ignore blank lines

        try:
            html = get_html(url)
        except requests.RequestException as exc:
            # One unreachable site should not abort the rest of the batch.
            print("failed to fetch {}: {}".format(url, exc))
            continue

        soup = BeautifulSoup(html, 'lxml')
        for li in soup.find_all(name='li'):         # each list item
            for a in li.find_all(name='a'):         # each link inside it
                # .string is None when the tag does not hold exactly one
                # text child; such links are skipped.
                if a.string is not None:
                    print(a.string)


# Script   Python  

评论

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×