0


python 爬虫代码

一、代码

1、爬虫_urllib_基本使用

# 使用urllib来获取百度首页的源码import urllib.request

# (1)定义一个url  就是你要访问的地址
url ='http://www.baidu.com'# (2)模拟浏览器向服务器发送请求 response响应
response = urllib.request.urlopen(url)# (3)获取响应中的页面的源码  content 内容的意思# read方法  返回的是字节形式的二进制数据# 我们要将二进制的数据转换为字符串# 二进制--》字符串  解码  decode('编码的格式')
content = response.read().decode('utf-8')# (4)打印数据
print(content)

2、爬虫_urllib_1个类型和6个方法

import urllib.request

url ='http://www.baidu.com'# 模拟浏览器向服务器发送请求
response = urllib.request.urlopen(url)# 一个类型和六个方法# response是HTTPResponse的类型# print(type(response))# 按照一个字节一个字节的去读# content = response.read()# print(content)# 返回多少个字节# content = response.read(5)# print(content)# 读取一行# content = response.readline()# print(content)# content = response.readlines()# print(content)# 返回状态码  如果是200了 那么就证明我们的逻辑没有错# print(response.getcode())# 返回的是url地址# print(response.geturl())# 获取是一个状态信息
print(response.getheaders())# 一个类型 HTTPResponse# 六个方法 read  readline  readlines  getcode geturl getheaders

3、爬虫_urllib_下载

import urllib.request

# 下载网页# url_page = 'http://www.baidu.com'# url代表的是下载的路径  filename文件的名字# 在python中 可以变量的名字  也可以直接写值# urllib.request.urlretrieve(url_page,'baidu.html')# 下载图片# url_img = 'https://img1.baidu.com/it/u=3004965690,4089234593&fm=26&fmt=auto&gp=0.jpg'## urllib.request.urlretrieve(url= url_img,filename='lisa.jpg')# 下载视频
url_video ='https://vd3.bdstatic.com/mda-mhkku4ndaka5etk3/1080p/cae_h264/1629557146541497769/mda-mhkku4ndaka5etk3.mp4?v_from_s=hkapp-haokan-tucheng&auth_key=1629687514-0-0-7ed57ed7d1168bb1f06d18a4ea214300&bcevod_channel=searchbox_feed&pd=1&pt=3&abtest='

urllib.request.urlretrieve(url_video,'hxekyyds.mp4')

4、爬虫_urllib_请求对象的定制

import urllib.request

url ='https://www.baidu.com'# url的组成# https://www.baidu.com/s?wd=周杰伦# http/https    www.baidu.com   80/443     s      wd = 周杰伦     ##    协议             主机        端口号     路径     参数           锚点# http   80# https  443# mysql  3306# oracle 1521# redis  6379# mongodb 27017

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}# 因为urlopen方法中不能存储字典 所以headers不能传递进去# 请求对象的定制
request = urllib.request.Request(url=url,headers=headers)

response = urllib.request.urlopen(request)

content = response.read().decode('utf8')

print(content)

5、爬虫_urllib_get请求的quote方法

# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6# 需求 获取 https://www.baidu.com/s?wd=周杰伦的网页源码import urllib.request
import urllib.parse

url ='https://www.baidu.com/s?wd='# 请求对象的定制为了解决反爬的第一种手段
headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}# 将周杰伦三个字变成unicode编码的格式# 我们需要依赖于urllib.parse
name = urllib.parse.quote('周杰伦')

url = url + name

# 请求对象的定制
request = urllib.request.Request(url=url,headers=headers)# 模拟浏览器向服务器发送请求
response = urllib.request.urlopen(request)# 获取响应的内容
content = response.read().decode('utf-8')# 打印数据
print(content)

6、爬虫_urllib_get请求的urlencode方法

# urlencode应用场景:多个参数的时候# https://www.baidu.com/s?wd=周杰伦&sex=男# import urllib.parse## data = {#     'wd':'周杰伦',#     'sex':'男',#     'location':'中国台湾省'# }## a = urllib.parse.urlencode(data)# print(a)#获取https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7的网页源码import urllib.request
import urllib.parse

base_url ='https://www.baidu.com/s?'

data ={'wd':'周杰伦',
    'sex':'男',
    'location':'中国台湾省'}

new_data = urllib.parse.urlencode(data)# 请求资源路径
url = base_url + new_data

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}# 请求对象的定制
request = urllib.request.Request(url=url,headers=headers)# 模拟浏览器向服务器发送请求
response = urllib.request.urlopen(request)# 获取网页源码的数据
content = response.read().decode('utf-8')# 打印数据
print(content)

7、爬虫_urllib_post请求百度翻译

# post请求import urllib.request
import urllib.parse

url ='https://fanyi.baidu.com/sug'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}

data ={'kw':'spider'}# post请求的参数 必须要进行编码
data = urllib.parse.urlencode(data).encode('utf-8')# post的请求的参数 是不会拼接在url的后面的  而是需要放在请求对象定制的参数中# post请求的参数 必须要进行编码
request = urllib.request.Request(url=url,data=data,headers=headers)# 模拟浏览器向服务器发送请求
response = urllib.request.urlopen(request)# 获取响应的数据
content = response.read().decode('utf-8')# 字符串--》json对象import json

obj = json.loads(content)
print(obj)# post请求方式的参数 必须编码   data = urllib.parse.urlencode(data)# 编码之后 必须调用encode方法 data = urllib.parse.urlencode(data).encode('utf-8')# 参数是放在请求对象定制的方法中  request = urllib.request.Request(url=url,data=data,headers=headers)

8、爬虫_urllib_post请求百度翻译之详细翻译

import urllib.request
import urllib.parse

url ='https://fanyi.baidu.com/v2transapi?from=en&to=zh'

headers ={# 'Accept': '*/*',# 'Accept-Encoding': 'gzip, deflate, br',# 'Accept-Language': 'zh-CN,zh;q=0.9',# 'Connection': 'keep-alive',# 'Content-Length': '135',# 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8','Cookie':'BIDUPSID=DAA8F9F0BD801A2929D96D69CF7EBF50; PSTM=1597202227; BAIDUID=DAA8F9F0BD801A29B2813502000BF8E9:SL=0:NR=10:FG=1; __yjs_duid=1_c19765bd685fa6fa12c2853fc392f8db1618999058029; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BDUSS=R2bEZvTjFCNHQxdUV-cTZ-MzZrSGxhbUYwSkRkUWk2SkxxS3E2M2lqaFRLUlJoRVFBQUFBJCQAAAAAAAAAAAEAAAA3e~BTveK-9sHLZGF5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOc7GBTnOxgaW; BDUSS_BFESS=R2bEZvTjFCNHQxdUV-cTZ-MzZrSGxhbUYwSkRkUWk2SkxxS3E2M2lqaFRLUlJoRVFBQUFBJCQAAAAAAAAAAAEAAAA3e~BTveK-9sHLZGF5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOc7GBTnOxgaW; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=DAA8F9F0BD801A29B2813502000BF8E9:SL=0:NR=10:FG=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=2; H_PS_PSSID=34435_31660_34405_34004_34073_34092_26350_34426_34323_22158_34390; delPer=1; BA_HECTOR=8185a12020018421b61gi6ka20q; BCLID=10943521300863382545; BDSFRCVID=boDOJexroG0YyvRHKn7hh7zlD_weG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR3aQ5rtKRTffjrnhPF3-44vXP6-hnjy3bRkX4Q4Wpv_Mnndjn6SQh4Wbttf5q3RymJ42-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD-ug3-7qqU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqC8hMIt43f; BCLID_BFESS=10943521300863382545; BDSFRCVID_BFESS=boDOJexroG0YyvRHKn7hh7zlD_weG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR3aQ5rtKRTffjrnhPF3-44vXP6-hnjy3bRkX4Q4Wpv_Mnndjn6SQh4Wbttf5q3RymJ42-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD-ug3-7qqU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqC8hMIt43f; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1629701482,1629702031,1629702343,1629704515; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1629704515; __yjs_st=2_MDBkZDdkNzg4YzYyZGU2NTM5NzBjZmQ0OTZiMWRmZGUxM2QwYzkwZTc2NTZmMmIxNDJkYzk4NzU1ZDUzN2U3Yjc4ZTJmYjE1YTUzMTljYWFkMWUwYmVmZGEzNmZjN2FlY2M3NDAzOThhZTY5NzI0MjVkMmQ0NWU3MWE1YTJmNGE5NDBhYjVlOWY3MTFiMWNjYTVhYWI0YThlMDVjODBkNWU2NjMwMzY2MjFhZDNkMzVhNGMzMGZkMWY2NjU5YzkxMDk3NTEzODJiZWUyMjEyYTk5YzY4ODUyYzNjZTJjMGM5MzhhMWE5YjU3NTM3NWZiOWQxNmU3MDVkODExYzFjN183XzliY2RhYjgz; ab_sr=1.0.1_ZTc2ZDFkMTU5ZTM0ZTM4MWVlNDU2MGEzYTM4MzZiY2I2MDIxNzY1Nzc1OWZjZGNiZWRhYjU5ZjYwZmNjMTE2ZjIzNmQxMTdiMzIzYTgzZjVjMTY0ZjM1YjMwZTdjMjhiNDRmN2QzMjMwNWRhZmUxYTJjZjZhNTViMGM2ODFlYjE5YTlmMWRjZDAwZGFmMDY4ZTFlNGJiZjU5YzE1MGIxN2FiYTU3NDgzZmI4MDdhMDM5NTQ0MjQxNDBiNzdhMDdl',
    # 'Host': 'fanyi.baidu.com',# 'Origin': 'https://fanyi.baidu.com',# 'Referer': 'https://fanyi.baidu.com/?aldtype=16047',# 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',# 'sec-ch-ua-mobile': '?0',# 'Sec-Fetch-Dest': 'empty',# 'Sec-Fetch-Mode': 'cors',# 'Sec-Fetch-Site': 'same-origin',# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',# 'X-Requested-With': 'XMLHttpRequest',}

data ={'from':'en',
    'to':'zh',
    'query':'love',
    'transtype':'realtime',
    'simple_means_flag':'3',
    'sign':'198772.518981',
    'token':'......',
    'domain':'common',
}# post请求的参数  必须进行编码 并且要调用encode方法
data = urllib.parse.urlencode(data).encode('utf-8')# 请求对象的定制
request = urllib.request.Request(url = url,data = data,headers = headers)# 模拟浏览器向服务器发送请求
response = urllib.request.urlopen(request)# 获取响应的数据
content = response.read().decode('utf-8')import json

obj = json.loads(content)
print(obj)

9、爬虫_urllib_ajax的get请求豆瓣电影第一页

# get请求# 获取豆瓣电影的第一页的数据 并且保存起来import urllib.request

url ='https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}# (1) 请求对象的定制
request = urllib.request.Request(url=url,headers=headers)# (2)获取响应的数据
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')# (3) 数据下载到本地# open方法默认情况下使用的是gbk的编码  如果我们要想保存汉字 那么需要在open方法中指定编码格式为utf-8# encoding = 'utf-8'# fp = open('douban.json','w',encoding='utf-8')# fp.write(content)

with open('douban1.json','w',encoding='utf-8') as fp:
    fp.write(content)

10、爬虫_urllib_ajax的get请求豆瓣电影前10页

# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&# start=0&limit=20# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&# start=20&limit=20# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&# start=40&limit=20# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&# start=60&limit=20# page    1  2   3   4# start   0  20  40  60# start (page - 1)*20# 下载豆瓣电影前10页的数据# (1) 请求对象的定制# (2) 获取响应的数据# (3) 下载数据import urllib.parse
import urllib.request

def create_request(page):
    base_url ='https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'

    data ={'start':(page - 1) * 20,
        'limit':20
    }

    data = urllib.parse.urlencode(data)

    url = base_url + data

    headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}

    request = urllib.request.Request(url=url,headers=headers)return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')return content

def down_load(page,content):
    with open('douban_' + str(page) + '.json','w',encoding='utf-8')as fp:
        fp.write(content)# 程序的入口if __name__ =='__main__':
    start_page = int(input('请输入起始的页码'))
    end_page = int(input('请输入结束的页面'))forpagein range(start_page,end_page+1):
#         每一页都有自己的请求对象的定制
        request = create_request(page)#         获取响应的数据
        content = get_content(request)#         下载
        down_load(page,content)

11、爬虫_urllib_ajax的post请求肯德基官网

# 1页# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname# post# cname: 北京# pid:# pageIndex: 1# pageSize: 10# 2页# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname# post# cname: 北京# pid:# pageIndex: 2# pageSize: 10import urllib.request
import urllib.parse

def create_request(page):
    base_url ='http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'

    data ={'cname':'北京',
        'pid':'',
        'pageIndex': page,
        'pageSize':'10'}

    data = urllib.parse.urlencode(data).encode('utf-8')

    headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}

    request = urllib.request.Request(url=base_url,headers=headers,data=data)return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')return content

def down_load(page,content):
    with open('kfc_' + str(page) + '.json','w',encoding='utf-8')as fp:
        fp.write(content)if __name__ =='__main__':
    start_page = int(input('请输入起始页码'))
    end_page = int(input('请输入结束页码'))forpagein range(start_page,end_page+1):
        # 请求对象的定制
        request = create_request(page)# 获取网页源码
        content = get_content(request)# 下载
        down_load(page,content)

12、爬虫_urllib_异常

import urllib.request
import urllib.error

# url = 'https://blog.csdn.net/sulixu/article/details/1198189491'

url ='http://www.doudan1111.com'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}

try:
    request = urllib.request.Request(url = url, headers = headers)

    response = urllib.request.urlopen(request)

    content = response.read().decode('utf-8')

    print(content)
except urllib.error.HTTPError:
    print('系统正在升级。。。')
except urllib.error.URLError:
    print('我都说了 系统正在升级。。。')

13、爬虫_urllib_微博的cookie登陆

# 适用的场景:数据采集的时候 需要绕过登陆 然后进入到某个页面# 个人信息页面是utf-8  但是还报错了编码错误  因为并没有进入到个人信息页面 而是跳转到了登陆页面# 那么登陆页面不是utf-8  所以报错# 什么情况下访问不成功?# 因为请求头的信息不够  所以访问不成功import urllib.request

url ='https://weibo.cn/6451491586/info'

headers ={# ':authority': 'weibo.cn',# ':method': 'GET',# ':path': '/6451491586/info',# ':scheme': 'https','accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'accept-encoding': 'gzip, deflate, br','accept-language':'zh-CN,zh;q=0.9',
'cache-control':'max-age=0',
#     cookie中携带着你的登陆信息   如果有登陆之后的cookie  那么我们就可以携带着cookie进入到任何页面'cookie':'_T_WM=24c44910ba98d188fced94ba0da5960e; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFxxfgNNUmXi4YiaYZKr_J_5NHD95QcSh-pSh.pSKncWs4DqcjiqgSXIgvVPcpD; SUB=_2A25MKKG_DeRhGeBK7lMV-S_JwzqIHXVv0s_3rDV6PUJbktCOLXL2kW1NR6e0UHkCGcyvxTYyKB2OV9aloJJ7mUNz; SSOLoginState=1630327279',
# referer  判断当前路径是不是由上一个路径进来的    一般情况下 是做图片防盗链'referer':'https://weibo.cn/',
'sec-ch-ua':'"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
'sec-ch-ua-mobile':'?0',
'sec-fetch-dest':'document',
'sec-fetch-mode':'navigate',
'sec-fetch-site':'same-origin',
'sec-fetch-user':'?1',
'upgrade-insecure-requests':'1',
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
}# 请求对象的定制
request = urllib.request.Request(url=url,headers=headers)# 模拟浏览器向服务器发送请求
response = urllib.request.urlopen(request)# 获取响应的数据
content = response.read().decode('utf-8')# 将数据保存到本地
with open('weibo.html','w',encoding='utf-8')as fp:
    fp.write(content)

14、爬虫_urllib_handler处理器的基本使用

# 需求 使用handler来访问百度  获取网页源码import urllib.request

url ='http://www.baidu.com'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}

request = urllib.request.Request(url = url,headers = headers)# handler   build_opener  open# (1)获取hanlder对象
handler = urllib.request.HTTPHandler()# (2)获取opener对象
opener = urllib.request.build_opener(handler)# (3) 调用open方法
response = opener.open(request)

content = response.read().decode('utf-8')

print(content)

15、爬虫_urllib_代理

import urllib.request

url ='http://www.baidu.com/s?wd=ip'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}# 请求对象的定制
request = urllib.request.Request(url = url,headers= headers)# 模拟浏览器访问服务器# response = urllib.request.urlopen(request)

proxies ={'http':'118.24.219.151:16817'}# handler  build_opener  open
handler = urllib.request.ProxyHandler(proxies = proxies)

opener = urllib.request.build_opener(handler)

response = opener.open(request)# 获取响应的信息
content = response.read().decode('utf-8')# 保存
with open('daili.html','w',encoding='utf-8')as fp:
    fp.write(content)

16、爬虫_urllib_代理池

import urllib.request

proxies_pool =[{'http':'118.24.219.151:16817'},
    {'http':'118.24.219.151:16817'},
]import random

proxies = random.choice(proxies_pool)

url ='http://www.baidu.com/s?wd=ip'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}

request = urllib.request.Request(url = url,headers=headers)

handler = urllib.request.ProxyHandler(proxies=proxies)

opener = urllib.request.build_opener(handler)

response = opener.open(request)

content = response.read().decode('utf-8')

with open('daili.html','w',encoding='utf-8')as fp:
    fp.write(content)

17、爬虫_解析_xpath的基本使用

<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"/><title>Title</title></head><body><ul><li id="l1"class="c1">北京</li><li id="l2">上海</li><li id="c3">深圳</li><li id="c4">武汉</li></ul><!--    <ul>--><!--        <li>大连</li>--><!--        <li>锦州</li>--><!--        <li>沈阳</li>--><!--    </ul>--></body></html>

18、爬虫_解析_xpath的基本使用

from lxml import etree

# xpath解析# (1)本地文件                                                etree.parse# (2)服务器响应的数据  response.read().decode('utf-8') *****   etree.HTML()# xpath解析本地文件
tree = etree.parse('爬虫_解析_xpath的基本使用.html')#tree.xpath('xpath路径')# 查找ul下面的li# li_list = tree.xpath('//body/ul/li')# 查找所有有id的属性的li标签# text()获取标签中的内容# li_list = tree.xpath('//ul/li[@id]/text()')# 找到id为l1的li标签  注意引号的问题# li_list = tree.xpath('//ul/li[@id="l1"]/text()')# 查找到id为l1的li标签的class的属性值# li = tree.xpath('//ul/li[@id="l1"]/@class')# 查询id中包含l的li标签# li_list = tree.xpath('//ul/li[contains(@id,"l")]/text()')# 查询id的值以l开头的li标签# li_list = tree.xpath('//ul/li[starts-with(@id,"c")]/text()')#查询id为l1和class为c1的# li_list = tree.xpath('//ul/li[@id="l1" and @class="c1"]/text()')

li_list = tree.xpath('//ul/li[@id="l1"]/text() | //ul/li[@id="l2"]/text()')# 判断列表的长度
print(li_list)
print(len(li_list))

19、爬虫_解析_获取百度网站的百度一下

# (1) 获取网页的源码# (2) 解析   解析的服务器响应的文件  etree.HTML# (3)  打印import urllib.request

url ='https://www.baidu.com/'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}# 请求对象的定制
request = urllib.request.Request(url = url,headers = headers)# 模拟浏览器访问服务器
response = urllib.request.urlopen(request)# 获取网页源码
content = response.read().decode('utf-8')# 解析网页源码 来获取我们想要的数据
from lxml import etree

# 解析服务器响应的文件
tree = etree.HTML(content)# 获取想要的数据  xpath的返回值是一个列表类型的数据
result = tree.xpath('//input[@id="su"]/@value')[0]

print(result)

20、爬虫_解析_站长素材

# (1) 请求对象的定制# (2)获取网页的源码# (3)下载# 需求 下载的前十页的图片# https://sc.chinaz.com/tupian/qinglvtupian.html   1# https://sc.chinaz.com/tupian/qinglvtupian_page.htmlimport urllib.request
from lxml import etree

def create_request(page):
    if(page ==1):
        url ='https://sc.chinaz.com/tupian/qinglvtupian.html'
    else:
        url ='https://sc.chinaz.com/tupian/qinglvtupian_' + str(page) + '.html'

    headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    }

    request = urllib.request.Request(url = url, headers = headers)return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')return content

def down_load(content):
#     下载图片# urllib.request.urlretrieve('图片地址','文件的名字')
    tree = etree.HTML(content)

    name_list = tree.xpath('//div[@id="container"]//a/img/@alt')# 一般设计图片的网站都会进行懒加载
    src_list = tree.xpath('//div[@id="container"]//a/img/@src2')foriin range(len(name_list)):
        name = name_list[i]
        src = src_list[i]
        url ='https:' + src

        urllib.request.urlretrieve(url=url,filename='./loveImg/' + name + '.jpg')if __name__ =='__main__':
    start_page = int(input('请输入起始页码'))
    end_page = int(input('请输入结束页码'))forpagein range(start_page,end_page+1):
        # (1) 请求对象的定制
        request = create_request(page)# (2)获取网页的源码
        content = get_content(request)# (3)下载
        down_load(content)

21、爬虫_解析_jsonpath

{"store":{"book":[{"category":"修真",
        "author":"六道",
        "title":"坏蛋是怎样练成的",
        "price":8.95},
      {"category":"修真",
        "author":"天蚕土豆",
        "title":"斗破苍穹",
        "price":12.99},
      {"category":"修真",
        "author":"唐家三少",
        "title":"斗罗大陆",
        "isbn":"0-553-21311-3",
        "price":8.99},
      {"category":"修真",
        "author":"南派三叔",
        "title":"星辰变",
        "isbn":"0-395-19395-8",
        "price":22.99}],
    "bicycle":{"author":"老马",
      "color":"黑色",
      "price":19.95}}}

22、爬虫_解析_jsonpath

import json
import jsonpath

obj = json.load(open('爬虫_解析_jsonpath.json','r',encoding='utf-8'))# 书店所有书的作者# author_list = jsonpath.jsonpath(obj,'$.store.book[*].author')# print(author_list)# 所有的作者# author_list = jsonpath.jsonpath(obj,'$..author')# print(author_list)# store下面的所有的元素# tag_list = jsonpath.jsonpath(obj,'$.store.*')# print(tag_list)# store里面所有东西的price# price_list = jsonpath.jsonpath(obj,'$.store..price')# print(price_list)# 第三个书# book = jsonpath.jsonpath(obj,'$..book[2]')# print(book)# 最后一本书# book = jsonpath.jsonpath(obj,'$..book[(@.length-1)]')# print(book)#     前面的两本书# book_list = jsonpath.jsonpath(obj,'$..book[0,1]')# book_list = jsonpath.jsonpath(obj,'$..book[:2]')# print(book_list)# 条件过滤需要在()的前面添加一个?#      过滤出所有的包含isbn的书。# book_list = jsonpath.jsonpath(obj,'$..book[?(@.isbn)]')# print(book_list)# 哪本书超过了10块钱
book_list = jsonpath.jsonpath(obj,'$..book[?(@.price>10)]')
print(book_list)

23、爬虫_解析_jsonpath解析淘票票

{"returnCode":"0","returnValue":{"A":[{"id":3643,"parentId":0,"regionName":"阿坝","cityCode":513200,"pinYin":"ABA"},{"id":3090,"parentId":0,"regionName":"阿克苏","cityCode":652900,"pinYin":"AKESU"},{"id":3632,"parentId":0,"regionName":"阿拉善","cityCode":152900,"pinYin":"ALASHAN"},{"id":899,"parentId":0,"regionName":"安康","cityCode":610900,"pinYin":"ANKANG"},{"id":196,"parentId":0,"regionName":"安庆","cityCode":340800,"pinYin":"ANQING"},{"id":758,"parentId":0,"regionName":"鞍山","cityCode":210300,"pinYin":"ANSHAN"},{"id":388,"parentId":0,"regionName":"安顺","cityCode":520400,"pinYin":"ANSHUN"},{"id":454,"parentId":0,"regionName":"安阳","cityCode":410500,"pinYin":"ANYANG"}],"B":[{"id":3633,"parentId":0,"regionName":"白城","cityCode":220800,"pinYin":"BAICHENG"},{"id":356,"parentId":0,"regionName":"百色","cityCode":451000,"pinYin":"BAISE"},{"id":634,"parentId":0,"regionName":"白山","cityCode":220600,"pinYin":"BAISHAN"},{"id":275,"parentId":0,"regionName":"白银","cityCode":620400,"pinYin":"BAIYIN"},{"id":426,"parentId":0,"regionName":"保定","cityCode":130600,"pinYin":"BAODING"},{"id":188,"parentId":0,"regionName":"宝鸡","cityCode":610300,"pinYin":"BAOJI"},{"id":994,"parentId":0,"regionName":"保山","cityCode":530500,"pinYin":"BAOSHAN"},{"id":1181,"parentId":0,"regionName":"包头","cityCode":150200,"pinYin":"BAOTOU"},{"id":789,"parentId":0,"regionName":"巴彦淖尔","cityCode":150800,"pinYin":"BAYANNAOER"},{"id":925,"parentId":0,"regionName":"巴中","cityCode":511900,"pinYin":"BAZHONG"},{"id":358,"parentId":0,"regionName":"北海","cityCode":450500,"pinYin":"BEIHAI"},{"id":3,"parentId":0,"regionName":"北京","cityCode":110100,"pinYin":"BEIJING"},{"id":200,"parentId":0,"regionName":"蚌埠","cityCode":340300,"pinYin":"BENGBU"},{"id":760,"parentId":0,"regionName":"本溪","cityCode":210500,"pinYin":"BENXI"},{"id":390,"parentId":0,"regionName":"毕节","cityCode":522401,"pinYin":"BIJIE"},{"id":824,"parentId":0,"regionName":"滨州","cityCode":371600,"pinYin":"BINZHOU"},{"id":1126,"parentId":0,"regionName":"亳州","cityCode":341600,"pinYin":"BOZHOU"},{"id":5860,"parentId":0,"regionName":"巴音郭楞","cityCode":652800,"pinYin":"BYGL"}],"C":[{"id":430,"parentId":0,"regionName":"沧州","cityCode":130900,"pinYin":"CANGZHOU"},{"id":623,"parentId":0,"regionName":"长春","cityCode":220100,"pinYin":"CHANGCHUN"},{"id":573,"parentId":0,"regionName":"常德","cityCode":430700,"pinYin":"CHANGDE"},{"id":983,"parentId":0,"regionName":"昌吉","cityCode":652300,"pinYin":"CHANGJI"},{"id":5781,"parentId":0,"regionName":"昌江","cityCode":469026,"pinYin":"CHANGJIANG"},{"id":576,"parentId":0,"regionName":"长沙","cityCode":430100,"pinYin":"CHANGSHA"},{"id":883,"parentId":0,"regionName":"长治","cityCode":140400,"pinYin":"CHANGZHI"},{"id":651,"parentId":0,"regionName":"常州","cityCode":320400,"pinYin":"CHANGZHOU"},{"id":3244,"parentId":0,"regionName":"朝阳","cityCode":211300,"pinYin":"CHAOYANG"},{"id":1138,"parentId":0,"regionName":"潮州","cityCode":445100,"pinYin":"CHAOZHOU"},{"id":433,"parentId":0,"regionName":"承德","cityCode":130800,"pinYin":"CHENGDE"},{"id":70,"parentId":0,"regionName":"成都","cityCode":510100,"pinYin":"CHENGDU"},{"id":5859,"parentId":0,"regionName":"澄迈县","cityCode":469023,"pinYin":"CHENGMAI"},{"id":585,"parentId":0,"regionName":"郴州","cityCode":431000,"pinYin":"CHENZHOU"},{"id":791,"parentId":0,"regionName":"赤峰","cityCode":150400,"pinYin":"CHIFENG"},{"id":205,"parentId":0,"regionName":"池州","cityCode":341700,"pinYin":"CHIZHOU"},{"id":40,"parentId":0,"regionName":"重庆","cityCode":500100,"pinYin":"CHONGQING"},{"id":3640,"parentId":0,"regionName":"崇左","cityCode":451400,"pinYin":"CHONGZUO"},{"id":996,"parentId":0,"regionName":"楚雄","cityCode":532300,"pinYin":"CHUXIONG"},{"id":207,"parentId":0,"regionName":"滁州","cityCode":341100,"pinYin":"CHUZHOU"}],"D":[{"id":998,"parentId":0,"regionName":"大理","cityCode":532900,"pinYin":"DALI"},{"id":763,"parentId":0,"regionName":"大连","cityCode":210200,"pinYin":"DALIAN"},{"id":3071,"parentId":0,"regionName":"儋州","cityCode":460400,"pinYin":"DAN"},{"id":753,"parentId":0,"regionName":"丹东","cityCode":210600,"pinYin":"DANDONG"},{"id":514,"parentId":0,"regionName":"大庆","cityCode":230600,"pinYin":"DAQING"},{"id":885,"parentId":0,"regionName":"大同","cityCode":140200,"pinYin":"DATONG"},{"id":3638,"parentId":0,"regionName":"大兴安岭","cityCode":232700,"pinYin":"DAXINGANLING"},{"id":935,"parentId":0,"regionName":"达州","cityCode":511700,"pinYin":"DAZHOU"},{"id":3650,"parentId":0,"regionName":"德宏","cityCode":533100,"pinYin":"DEHONG"},{"id":937,"parentId":0,"regionName":"德阳","cityCode":510600,"pinYin":"DEYANG"},{"id":827,"parentId":0,"regionName":"德州","cityCode":371400,"pinYin":"DEZHOU"},{"id":5884,"parentId":0,"regionName":"定安","cityCode":469021,"pinYin":"DINGANXIAN"},{"id":1135,"parentId":0,"regionName":"定西","cityCode":621100,"pinYin":"DINGXI"},{"id":1000,"parentId":0,"regionName":"迪庆","cityCode":533400,"pinYin":"DIQINGZANGZU"},{"id":5742,"parentId":0,"regionName":"东方","cityCode":469007,"pinYin":"DONGFANG"},{"id":109,"parentId":0,"regionName":"东莞","cityCode":441900,"pinYin":"DONGGUAN"},{"id":829,"parentId":0,"regionName":"东营","cityCode":370500,"pinYin":"DONGYING"}],"E":[{"id":793,"parentId":0,"regionName":"鄂尔多斯","cityCode":150600,"pinYin":"EERDUOSI"},{"id":541,"parentId":0,"regionName":"恩施","cityCode":422800,"pinYin":"ENSHI"},{"id":543,"parentId":0,"regionName":"鄂州","cityCode":420700,"pinYin":"EZHOU"}],"F":[{"id":360,"parentId":0,"regionName":"防城港","cityCode":450600,"pinYin":"FANGCHENGGANG"},{"id":61,"parentId":0,"regionName":"佛山","cityCode":440600,"pinYin":"FOSHAN"},{"id":770,"parentId":0,"regionName":"抚顺","cityCode":210400,"pinYin":"FUSHUN"},{"id":1176,"parentId":0,"regionName":"阜新","cityCode":210900,"pinYin":"FUXIN"},{"id":1125,"parentId":0,"regionName":"阜阳","cityCode":341200,"pinYin":"FUYANG"},{"id":745,"parentId":0,"regionName":"抚州","cityCode":361000,"pinYin":"FUZHOU"},{"id":98,"parentId":0,"regionName":"福州","cityCode":350100,"pinYin":"FUZHOU"}],"G":[{"id":3658,"parentId":0,"regionName":"甘南","cityCode":623000,"pinYin":"GANNAN"},{"id":718,"parentId":0,"regionName":"赣州","cityCode":360700,"pinYin":"GANZHOU"},{"id":3644,"parentId":0,"regionName":"甘孜","cityCode":513300,"pinYin":"GANZI"},{"id":2166,"parentId":43,"regionName":"巩义市","cityCode":410181,"pinYin":"GONGYI","selected":1},{"id":3642,"parentId":0,"regionName":"广安","cityCode":511600,"pinYin":"GUANGAN"},{"id":3453,"parentId":0,"regionName":"广元","cityCode":510800,"pinYin":"GUANGYUAN"},{"id":8,"parentId":0,"regionName":"广州","cityCode":440100,"pinYin":"GUANGZHOU"},{"id":362,"parentId":0,"regionName":"贵港","cityCode":450800,"pinYin":"GUIGANG"},{"id":364,"parentId":0,"regionName":"桂林","cityCode":450300,"pinYin":"GUILIN"},{"id":394,"parentId":0,"regionName":"贵阳","cityCode":520100,"pinYin":"GUIYANG"},{"id":1183,"parentId":0,"regionName":"固原","cityCode":640400,"pinYin":"GUYUAN"}],"H":[{"id":508,"parentId":0,"regionName":"哈尔滨","cityCode":230100,"pinYin":"HAERBIN"},{"id":3659,"parentId":0,"regionName":"海东","cityCode":630200,"pinYin":"HAIDONG"},{"id":414,"parentId":0,"regionName":"海口","cityCode":460100,"pinYin":"HAIKOU"},{"id":5788,"parentId":0,"regionName":"海南州","cityCode":632500,"pinYin":"HAINANZHOU"},{"id":3665,"parentId":0,"regionName":"海西","cityCode":632800,"pinYin":"HAIXI"},{"id":3669,"parentId":0,"regionName":"哈密","cityCode":652200,"pinYin":"HAMI"},{"id":435,"parentId":0,"regionName":"邯郸","cityCode":130400,"pinYin":"HANDAN"},{"id":16,"parentId":0,"regionName":"杭州","cityCode":330100,"pinYin":"HANGZHOU","selected":0},{"id":902,"parentId":0,"regionName":"汉中","cityCode":610700,"pinYin":"HANZHONG"},{"id":460,"parentId":0,"regionName":"鹤壁","cityCode":410600,"pinYin":"HEBI"},{"id":1144,"parentId":0,"regionName":"河池","cityCode":451200,"pinYin":"HECHI"},{"id":210,"parentId":0,"regionName":"合肥","cityCode":340100,"pinYin":"HEFEI"},{"id":1154,"parentId":0,"regionName":"鹤岗","cityCode":230400,"pinYin":"HEGANG"},{"id":3637,"parentId":0,"regionName":"黑河","cityCode":231100,"pinYin":"HEIHE"},{"id":1148,"parentId":0,"regionName":"衡水","cityCode":131100,"pinYin":"HENGSHUI"},{"id":587,"parentId":0,"regionName":"衡阳","cityCode":430400,"pinYin":"HENGYANG"},{"id":3673,"parentId":0,"regionName":"和田","cityCode":653200,"pinYin":"HETIAN"},{"id":319,"parentId":0,"regionName":"河源","cityCode":441600,"pinYin":"HEYUAN"},{"id":832,"parentId":0,"regionName":"菏泽","cityCode":371700,"pinYin":"HEZE"},{"id":370,"parentId":0,"regionName":"贺州","cityCode":451100,"pinYin":"HEZHOU"},{"id":1002,"parentId":0,"regionName":"红河","cityCode":532500,"pinYin":"HONGHE"},{"id":666,"parentId":0,"regionName":"淮安","cityCode":320800,"pinYin":"HUAIAN"},{"id":1127,"parentId":0,"regionName":"淮北","cityCode":340600,"pinYin":"HUAIBEI"},{"id":590,"parentId":0,"regionName":"怀化","cityCode":431200,"pinYin":"HUAIHUA"},{"id":215,"parentId":0,"regionName":"淮南","cityCode":340400,"pinYin":"HUAINAN"},{"id":547,"parentId":0,"regionName":"黄冈","cityCode":421100,"pinYin":"HUANGGANG"},{"id":3661,"parentId":0,"regionName":"黄南","cityCode":632300,"pinYin":"HUANGNAN"},{"id":217,"parentId":0,"regionName":"黄山","cityCode":341000,"pinYin":"HUANGSHAN"},{"id":550,"parentId":0,"regionName":"黄石","cityCode":420200,"pinYin":"HUANGSHI"},{"id":796,"parentId":0,"regionName":"呼和浩特","cityCode":150100,"pinYin":"HUHEHAOTE"},{"id":163,"parentId":0,"regionName":"惠州","cityCode":441300,"pinYin":"HUIZHOU"},{"id":776,"parentId":0,"regionName":"葫芦岛","cityCode":211400,"pinYin":"HULUDAO"},{"id":801,"parentId":0,"regionName":"呼伦贝尔","cityCode":150700,"pinYin":"HULUNBEIER"},{"id":173,"parentId":0,"regionName":"湖州","cityCode":330500,"pinYin":"HUZHOU"}],"J":[{"id":523,"parentId":0,"regionName":"佳木斯","cityCode":230800,"pinYin":"JIAMUSI"},{"id":747,"parentId":0,"regionName":"吉安","cityCode":360800,"pinYin":"JIAN"},{"id":317,"parentId":0,"regionName":"江门","cityCode":440700,"pinYin":"JIANGMEN"},{"id":462,"parentId":0,"regionName":"焦作","cityCode":410800,"pinYin":"JIAOZUO"},{"id":156,"parentId":0,"regionName":"嘉兴","cityCode":330400,"pinYin":"JIAXING"},{"id":1136,"parentId":0,"regionName":"嘉峪关","cityCode":620200,"pinYin":"JIAYUGUAN"},{"id":327,"parentId":0,"regionName":"揭阳","cityCode":445200,"pinYin":"JIEYANG"},{"id":628,"parentId":0,"regionName":"吉林","cityCode":220200,"pinYin":"JILIN"},{"id":837,"parentId":0,"regionName":"济南","cityCode":370100,"pinYin":"JINAN"},{"id":3556,"parentId":0,"regionName":"金昌","cityCode":620300,"pinYin":"JINCHANG"},{"id":892,"parentId":0,"regionName":"晋城","cityCode":140500,"pinYin":"JINCHENG"},{"id":724,"parentId":0,"regionName":"景德镇","cityCode":360200,"pinYin":"JINGDEZHEN"},{"id":536,"parentId":0,"regionName":"荆门","cityCode":420800,"pinYin":"JINGMEN"},{"id":545,"parentId":0,"regionName":"荆州","cityCode":421000,"pinYin":"JINGZHOU"},{"id":142,"parentId":0,"regionName":"金华","cityCode":330700,"pinYin":"JINHUA"},{"id":842,"parentId":0,"regionName":"济宁","cityCode":370800,"pinYin":"JINING"},{"id":894,"parentId":0,"regionName":"晋中","cityCode":140700,"pinYin":"JINZHONG"},{"id":779,"parentId":0,"regionName":"锦州","cityCode":210700,"pinYin":"JINZHOU"},{"id":726,"parentId":0,"regionName":"九江","cityCode":360400,"pinYin":"JIUJIANG"},{"id":277,"parentId":0,"regionName":"酒泉","cityCode":620900,"pinYin":"JIUQUAN"},{"id":521,"parentId":0,"regionName":"鸡西","cityCode":230300,"pinYin":"JIXI"},{"id":1102,"parentId":0,"regionName":"济源","cityCode":410881,"pinYin":"JIYUAN"}],"K":[{"id":466,"parentId":0,"regionName":"开封","cityCode":410200,"pinYin":"KAIFENG"},{"id":985,"parentId":0,"regionName":"喀什","cityCode":653100,"pinYin":"KASHEN"},{"id":3667,"parentId":0,"regionName":"克拉玛依","cityCode":650200,"pinYin":"KELAMAYI"},{"id":3672,"parentId":0,"regionName":"克孜勒苏柯尔克孜","cityCode":653000,"pinYin":"KEZILESUKEERKEZI"},{"id":18,"parentId":0,"regionName":"昆明","cityCode":530100,"pinYin":"KUNMING"}],"L":[{"id":3639,"parentId":0,"regionName":"来宾","cityCode":451300,"pinYin":"LAIBIN"},{"id":419,"parentId":0,"regionName":"廊坊","cityCode":131000,"pinYin":"LANGFANG"},{"id":279,"parentId":0,"regionName":"兰州","cityCode":620100,"pinYin":"LANZHOU"},{"id":979,"parentId":0,"regionName":"拉萨","cityCode":540100,"pinYin":"LASA"},{"id":940,"parentId":0,"regionName":"乐山","cityCode":511100,"pinYin":"LESHAN"},{"id":3645,"parentId":0,"regionName":"凉山","cityCode":513400,"pinYin":"LIANGSHAN"},{"id":677,"parentId":0,"regionName":"连云港","cityCode":320700,"pinYin":"LIANYUNGANG"},{"id":847,"parentId":0,"regionName":"聊城","cityCode":371500,"pinYin":"LIAOCHENG"},{"id":1178,"parentId":0,"regionName":"辽阳","cityCode":211000,"pinYin":"LIAOYANG"},{"id":630,"parentId":0,"regionName":"辽源","cityCode":220400,"pinYin":"LIAOYUAN"},{"id":992,"parentId":0,"regionName":"丽江","cityCode":530700,"pinYin":"LIJIANG"},{"id":1008,"parentId":0,"regionName":"临沧","cityCode":530900,"pinYin":"LINCANG"},{"id":890,"parentId":0,"regionName":"临汾","cityCode":141000,"pinYin":"LINFEN"},{"id":5590,"parentId":0,"regionName":"临高","cityCode":469024,"pinYin":"LINGAO"},{"id":3498,"parentId":0,"regionName":"临夏","cityCode":622900,"pinYin":"LINXIA"},{"id":849,"parentId":0,"regionName":"临沂","cityCode":371300,"pinYin":"LINYI"},{"id":3657,"parentId":0,"regionName":"林芝","cityCode":542600,"pinYin":"LINZHI"},{"id":1039,"parentId":0,"regionName":"丽水","cityCode":331100,"pinYin":"LISHUI"},{"id":227,"parentId":0,"regionName":"六安","cityCode":341500,"pinYin":"LIUAN"},{"id":406,"parentId":0,"regionName":"六盘水","cityCode":520200,"pinYin":"LIUPANSHUI"},{"id":380,"parentId":0,"regionName":"柳州","cityCode":450200,"pinYin":"LIUZHOU"},{"id":288,"parentId":0,"regionName":"陇南","cityCode":621200,"pinYin":"LONGNAN"},{"id":263,"parentId":0,"regionName":"龙岩","cityCode":350800,"pinYin":"LONGYAN"},{"id":595,"parentId":0,"regionName":"娄底","cityCode":431300,"pinYin":"LOUDI"},{"id":5863,"parentId":0,"regionName":"陵水","cityCode":469028,"pinYin":"LS"},{"id":1194,"parentId":0,"regionName":"吕梁","cityCode":141100,"pinYin":"LULIANG"},{"id":495,"parentId":0,"regionName":"漯河","cityCode":411100,"pinYin":"LUOHE"},{"id":486,"parentId":0,"regionName":"洛阳","cityCode":410300,"pinYin":"LUOYANG"},{"id":959,"parentId":0,"regionName":"泸州","cityCode":510500,"pinYin":"LUZHOU"}],"M":[{"id":170,"parentId":0,"regionName":"马鞍山","cityCode":340500,"pinYin":"MAANSHAN"},{"id":348,"parentId":0,"regionName":"茂名","cityCode":440900,"pinYin":"MAOMING"},{"id":961,"parentId":0,"regionName":"眉山","cityCode":511400,"pinYin":"MEISHAN"},{"id":350,"parentId":0,"regionName":"梅州","cityCode":441400,"pinYin":"MEIZHOU"},{"id":944,"parentId":0,"regionName":"绵阳","cityCode":510700,"pinYin":"MIANYANG"},{"id":528,"parentId":0,"regionName":"牡丹江","cityCode":231000,"pinYin":"MUDANJIANG"}],"N":[{"id":738,"parentId":0,"regionName":"南昌","cityCode":360100,"pinYin":"NANCHANG"},{"id":968,"parentId":0,"regionName":"南充","cityCode":511300,"pinYin":"NANCHONG"},{"id":63,"parentId":0,"regionName":"南京","cityCode":320100,"pinYin":"NANJING"},{"id":372,"parentId":0,"regionName":"南宁","cityCode":450100,"pinYin":"NANNING"},{"id":254,"parentId":0,"regionName":"南平","cityCode":350700,"pinYin":"NANPING"},{"id":132,"parentId":0,"regionName":"南通","cityCode":320600,"pinYin":"NANTONG"},{"id":499,"parentId":0,"regionName":"南阳","cityCode":411300,"pinYin":"NANYANG"},{"id":970,"parentId":0,"regionName":"内江","cityCode":511000,"pinYin":"NEIJIANG"},{"id":147,"parentId":0,"regionName":"宁波","cityCode":330200,"pinYin":"NINGBO"},{"id":268,"parentId":0,"regionName":"宁德","cityCode":350900,"pinYin":"NINGDE"},{"id":3651,"parentId":0,"regionName":"怒江","cityCode":533300,"pinYin":"NUJIANG"}],"P":[{"id":784,"parentId":0,"regionName":"盘锦","cityCode":211100,"pinYin":"PANJIN"},{"id":951,"parentId":0,"regionName":"攀枝花","cityCode":510400,"pinYin":"PANZHIHUA"},{"id":502,"parentId":0,"regionName":"平顶山","cityCode":410400,"pinYin":"PINGDINGSHAN"},{"id":1137,"parentId":0,"regionName":"平凉","cityCode":620800,"pinYin":"PINGLIANG"},{"id":711,"parentId":0,"regionName":"萍乡","cityCode":360300,"pinYin":"PINGXIANG"},{"id":3198,"parentId":0,"regionName":"普洱","cityCode":530800,"pinYin":"PUER"},{"id":271,"parentId":0,"regionName":"莆田","cityCode":350300,"pinYin":"PUTIAN"},{"id":458,"parentId":0,"regionName":"濮阳","cityCode":410900,"pinYin":"PUYANG"}],"Q":[{"id":3647,"parentId":0,"regionName":"黔东南","cityCode":522600,"pinYin":"QIANDONGNAN"},{"id":1158,"parentId":0,"regionName":"潜江","cityCode":429005,"pinYin":"QIANJIANG"},{"id":3648,"parentId":0,"regionName":"黔南","cityCode":522700,"pinYin":"QIANNAN"},{"id":3646,"parentId":0,"regionName":"黔西南","cityCode":522300,"pinYin":"QIANXINAN"},{"id":51,"parentId":0,"regionName":"青岛","cityCode":370200,"pinYin":"QINGDAO"},{"id":3318,"parentId":0,"regionName":"庆阳","cityCode":621000,"pinYin":"QINGYANG"},{"id":102,"parentId":0,"regionName":"清远","cityCode":441800,"pinYin":"QINGYUAN"},{"id":446,"parentId":0,"regionName":"秦皇岛","cityCode":130300,"pinYin":"QINHUANGDAO"},{"id":1145,"parentId":0,"regionName":"钦州","cityCode":450700,"pinYin":"QINZHOU"},{"id":1124,"parentId":0,"regionName":"琼海","cityCode":469002,"pinYin":"QIONGHAI"},{"id":5851,"parentId":0,"regionName":"琼中","cityCode":469030,"pinYin":"QIONGZHONG"},{"id":530,"parentId":0,"regionName":"齐齐哈尔","cityCode":230200,"pinYin":"QIQIHAER"},{"id":3636,"parentId":0,"regionName":"七台河","cityCode":230900,"pinYin":"QITAIHE"},{"id":245,"parentId":0,"regionName":"泉州","cityCode":350500,"pinYin":"QUANZHOU"},{"id":1016,"parentId":0,"regionName":"曲靖","cityCode":530300,"pinYin":"QUJING"},{"id":145,"parentId":0,"regionName":"衢州","cityCode":330800,"pinYin":"QUZHOU"}],"R":[{"id":3654,"parentId":0,"regionName":"日喀则","cityCode":540200,"pinYin":"RIKEZE"},{"id":877,"parentId":0,"regionName":"日照","cityCode":371100,"pinYin":"RIZHAO"}],"S":[{"id":449,"parentId":0,"regionName":"三门峡","cityCode":411200,"pinYin":"SANMENXIA"},{"id":239,"parentId":0,"regionName":"三明","cityCode":350400,"pinYin":"SANMING"},{"id":410,"parentId":0,"regionName":"三亚","cityCode":460200,"pinYin":"SANYA"},{"id":1,"parentId":0,"regionName":"上海","cityCode":310100,"pinYin":"SHANGHAI"},{"id":897,"parentId":0,"regionName":"商洛","cityCode":611000,"pinYin":"SHANGLUO"},{"id":452,"parentId":0,"regionName":"商丘","cityCode":411400,"pinYin":"SHANGQIU"},{"id":713,"parentId":0,"regionName":"上饶","cityCode":361100,"pinYin":"SHANGRAO"},{"id":3653,"parentId":0,"regionName":"山南","cityCode":540500,"pinYin":"SHANNANSHI"},{"id":290,"parentId":0,"regionName":"汕头","cityCode":440500,"pinYin":"SHANTOU"},{"id":294,"parentId":0,"regionName":"汕尾","cityCode":441500,"pinYin":"SHANWEI"},{"id":296,"parentId":0,"regionName":"韶关","cityCode":440200,"pinYin":"SHAOGUAN"},{"id":66,"parentId":0,"regionName":"绍兴","cityCode":330600,"pinYin":"SHAOXING"},{"id":571,"parentId":0,"regionName":"邵阳","cityCode":430500,"pinYin":"SHAOYANG"},{"id":75,"parentId":0,"regionName":"沈阳","cityCode":210100,"pinYin":"SHENYANG"},{"id":28,"parentId":0,"regionName":"深圳","cityCode":440300,"pinYin":"SHENZHEN"},{"id":1200,"parentId":0,"regionName":"石河子","cityCode":659001,"pinYin":"SHIHEZI"},{"id":59,"parentId":0,"regionName":"石家庄","cityCode":130100,"pinYin":"SHIJIAZHUANG"},{"id":68,"parentId":0,"regionName":"十堰","cityCode":420300,"pinYin":"SHIYAN"},{"id":807,"parentId":0,"regionName":"石嘴山","cityCode":640200,"pinYin":"SHIZUISHAN"},{"id":3635,"parentId":0,"regionName":"双鸭山","cityCode":230500,"pinYin":"SHUANGYASHAN"},{"id":3629,"parentId":0,"regionName":"朔州","cityCode":140600,"pinYin":"SHUOZHOU"},{"id":621,"parentId":0,"regionName":"四平","cityCode":220300,"pinYin":"SIPING"},{"id":1174,"parentId":0,"regionName":"松原","cityCode":220700,"pinYin":"SONGYUAN"},{"id":511,"parentId":0,"regionName":"绥化","cityCode":231200,"pinYin":"SUIHUA"},{"id":922,"parentId":0,"regionName":"遂宁","cityCode":510900,"pinYin":"SUINING"},{"id":534,"parentId":0,"regionName":"随州","cityCode":421300,"pinYin":"SUIZHOU"},{"id":644,"parentId":0,"regionName":"宿迁","cityCode":321300,"pinYin":"SUQIAN"},{"id":193,"parentId":0,"regionName":"宿州","cityCode":341300,"pinYin":"SUZHOU"},{"id":107,"parentId":0,"regionName":"苏州","cityCode":320500,"pinYin":"SUZHOU"}],"T":[{"id":3674,"parentId":0,"regionName":"塔城","cityCode":654200,"pinYin":"TACHENG"},{"id":817,"parentId":0,"regionName":"泰安","cityCode":370900,"pinYin":"TAIAN"},{"id":81,"parentId":0,"regionName":"太原","cityCode":140100,"pinYin":"TAIYUAN"},{"id":181,"parentId":0,"regionName":"台州","cityCode":331000,"pinYin":"TAIZHOU"},{"id":640,"parentId":0,"regionName":"泰州","cityCode":321200,"pinYin":"TAIZHOU"},{"id":83,"parentId":0,"regionName":"唐山","cityCode":130200,"pinYin":"TANGSHAN"},{"id":22,"parentId":0,"regionName":"天津","cityCode":120100,"pinYin":"TIANJIN"},{"id":1159,"parentId":0,"regionName":"天门","cityCode":429006,"pinYin":"TIANMEN"},{"id":1119,"parentId":0,"regionName":"天水","cityCode":620500,"pinYin":"TIANSHUI"},{"id":1179,"parentId":0,"regionName":"铁岭","cityCode":211200,"pinYin":"TIELING"},{"id":1187,"parentId":0,"regionName":"铜川","cityCode":610200,"pinYin":"TONGCHUAN"},{"id":619,"parentId":0,"regionName":"通化","cityCode":220500,"pinYin":"TONGHUA"},{"id":787,"parentId":0,"regionName":"通辽","cityCode":150500,"pinYin":"TONGLIAO"},{"id":191,"parentId":0,"regionName":"铜陵","cityCode":340700,"pinYin":"TONGLING"},{"id":386,"parentId":0,"regionName":"铜仁","cityCode":522201,"pinYin":"TONGREN"}],"W":[{"id":5534,"parentId":0,"regionName":"万宁","cityCode":469006,"pinYin":"WANNING"},{"id":821,"parentId":0,"regionName":"潍坊","cityCode":370700,"pinYin":"WEIFANG"},{"id":853,"parentId":0,"regionName":"威海","cityCode":371000,"pinYin":"WEIHAI"},{"id":905,"parentId":0,"regionName":"渭南","cityCode":610500,"pinYin":"WEINAN"},{"id":5773,"parentId":0,"regionName":"文昌","cityCode":469005,"pinYin":"WENCHANG"},{"id":3269,"parentId":0,"regionName":"文山","cityCode":532600,"pinYin":"WENSHAN"},{"id":1047,"parentId":0,"regionName":"温州","cityCode":330300,"pinYin":"WENZHOU"},{"id":803,"parentId":0,"regionName":"乌海","cityCode":150300,"pinYin":"WUHAI"},{"id":10,"parentId":0,"regionName":"武汉","cityCode":420100,"pinYin":"WUHAN"},{"id":219,"parentId":0,"regionName":"芜湖","cityCode":340200,"pinYin":"WUHU"},{"id":5754,"parentId":0,"regionName":"五家渠","cityCode":659004,"pinYin":"WUJIAQU"},{"id":3630,"parentId":0,"regionName":"乌兰察布","cityCode":150900,"pinYin":"WULANCHABU"},{"id":987,"parentId":0,"regionName":"乌鲁木齐","cityCode":650100,"pinYin":"WULUMUQI"},{"id":284,"parentId":0,"regionName":"武威","cityCode":620600,"pinYin":"WUWEI"},{"id":151,"parentId":0,"regionName":"无锡","cityCode":320200,"pinYin":"WUXI"},{"id":3666,"parentId":0,"regionName":"吴忠","cityCode":640300,"pinYin":"WUZHONG"},{"id":374,"parentId":0,"regionName":"梧州","cityCode":450400,"pinYin":"WUZHOU"}],"X":[{"id":89,"parentId":0,"regionName":"厦门","cityCode":350200,"pinYin":"XIAMEN"},{"id":46,"parentId":0,"regionName":"西安","cityCode":610100,"pinYin":"XIAN"},{"id":599,"parentId":0,"regionName":"湘潭","cityCode":430300,"pinYin":"XIANGTAN"},{"id":602,"parentId":0,"regionName":"湘西","cityCode":433100,"pinYin":"XIANGXI"},{"id":731,"parentId":0,"regionName":"襄阳","cityCode":420600,"pinYin":"XIANGYANG"},{"id":538,"parentId":0,"regionName":"咸宁","cityCode":421200,"pinYin":"XIANNING"},{"id":569,"parentId":0,"regionName":"仙桃","cityCode":429004,"pinYin":"XIANTAO"},{"id":918,"parentId":0,"regionName":"咸阳","cityCode":610400,"pinYin":"XIANYANG"},{"id":1160,"parentId":0,"regionName":"孝感","cityCode":420900,"pinYin":"XIAOGAN"},{"id":3303,"parentId":0,"regionName":"锡林郭勒","cityCode":152500,"pinYin":"XILINGUOLE"},{"id":3631,"parentId":0,"regionName":"兴安盟","cityCode":152200,"pinYin":"XINGAN"},{"id":441,"parentId":0,"regionName":"邢台","cityCode":130500,"pinYin":"XINGTAI"},{"id":3679,"parentId":3646,"regionName":"兴义市","cityCode":522301,"pinYin":"XINGYI","selected":1},{"id":814,"parentId":0,"regionName":"西宁","cityCode":630100,"pinYin":"XINING"},{"id":472,"parentId":0,"regionName":"新乡","cityCode":410700,"pinYin":"XINXIANG"},{"id":470,"parentId":0,"regionName":"信阳","cityCode":411500,"pinYin":"XINYANG"},{"id":733,"parentId":0,"regionName":"新余","cityCode":360500,"pinYin":"XINYU"},{"id":3432,"parentId":0,"regionName":"忻州","cityCode":140900,"pinYin":"XINZHOU"},{"id":1010,"parentId":0,"regionName":"西双版纳","cityCode":532800,"pinYin":"XISHUANGBANNA"},{"id":224,"parentId":0,"regionName":"宣城","cityCode":341800,"pinYin":"XUANCHENG"},{"id":477,"parentId":0,"regionName":"许昌","cityCode":411000,"pinYin":"XUCHANG"},{"id":95,"parentId":0,"regionName":"徐州","cityCode":320300,"pinYin":"XUZHOU"}],"Y":[{"id":3438,"parentId":0,"regionName":"雅安","cityCode":511800,"pinYin":"YAAN"},{"id":912,"parentId":0,"regionName":"延安","cityCode":610600,"pinYin":"YANAN"},{"id":3634,"parentId":0,"regionName":"延边","cityCode":222400,"pinYin":"YANBIAN"},{"id":642,"parentId":0,"regionName":"盐城","cityCode":320900,"pinYin":"YANCHENG"},{"id":329,"parentId":0,"regionName":"阳江","cityCode":441700,"pinYin":"YANGJIANG"},{"id":5750,"parentId":0,"regionName":"洋浦","cityCode":469000,"pinYin":"YANGPU"},{"id":1195,"parentId":0,"regionName":"阳泉","cityCode":140300,"pinYin":"YANGQUAN"},{"id":660,"parentId":0,"regionName":"扬州","cityCode":321000,"pinYin":"YANGZHOU"},{"id":105,"parentId":0,"regionName":"烟台","cityCode":370600,"pinYin":"YANTAI"},{"id":949,"parentId":0,"regionName":"宜宾","cityCode":511500,"pinYin":"YIBIN"},{"id":565,"parentId":0,"regionName":"宜昌","cityCode":420500,"pinYin":"YICHANG"},{"id":3463,"parentId":0,"regionName":"伊春","cityCode":230700,"pinYin":"YICHUN"},{"id":716,"parentId":0,"regionName":"宜春","cityCode":360900,"pinYin":"YICHUN"},{"id":1104,"parentId":0,"regionName":"伊犁","cityCode":654000,"pinYin":"YILI"},{"id":810,"parentId":0,"regionName":"银川","cityCode":640100,"pinYin":"YINCHUAN"},{"id":774,"parentId":0,"regionName":"营口","cityCode":210800,"pinYin":"YINGKOU"},{"id":1170,"parentId":0,"regionName":"鹰潭","cityCode":360600,"pinYin":"YINGTAN"},{"id":4636,"parentId":151,"regionName":"宜兴市","cityCode":320282,"pinYin":"YIXINGSHI","selected":1},{"id":605,"parentId":0,"regionName":"益阳","cityCode":430900,"pinYin":"YIYANG"},{"id":1164,"parentId":0,"regionName":"永州","cityCode":431100,"pinYin":"YONGZHOU"},{"id":607,"parentId":0,"regionName":"岳阳","cityCode":430600,"pinYin":"YUEYANG"},{"id":378,"parentId":0,"regionName":"玉林","cityCode":450900,"pinYin":"YULIN"},{"id":914,"parentId":0,"regionName":"榆林","cityCode":610800,"pinYin":"YULIN"},{"id":888,"parentId":0,"regionName":"运城","cityCode":140800,"pinYin":"YUNCHENG"},{"id":332,"parentId":0,"regionName":"云浮","cityCode":445300,"pinYin":"YUNFU"},{"id":3664,"parentId":0,"regionName":"玉树","cityCode":632700,"pinYin":"YUSHU"},{"id":1012,"parentId":0,"regionName":"玉溪","cityCode":530400,"pinYin":"YUXI"}],"Z":[{"id":857,"parentId":0,"regionName":"枣庄","cityCode":370400,"pinYin":"ZAOZHUANG"},{"id":1236,"parentId":0,"regionName":"张家界","cityCode":430800,"pinYin":"ZHANGGUJIE"},{"id":443,"parentId":0,"regionName":"张家口","cityCode":130700,"pinYin":"ZHANGJIAKOU"},{"id":286,"parentId":0,"regionName":"张掖","cityCode":620700,"pinYin":"ZHANGYE"},{"id":243,"parentId":0,"regionName":"漳州","cityCode":350600,"pinYin":"ZHANGZHOU"},{"id":334,"parentId":0,"regionName":"湛江","cityCode":440800,"pinYin":"ZHANJIANG"},{"id":337,"parentId":0,"regionName":"肇庆","cityCode":441200,"pinYin":"ZHAOQING"},{"id":3649,"parentId":0,"regionName":"昭通","cityCode":530600,"pinYin":"ZHAOTONG"},{"id":43,"parentId":0,"regionName":"郑州","cityCode":410100,"pinYin":"ZHENGZHOU"},{"id":657,"parentId":0,"regionName":"镇江","cityCode":321100,"pinYin":"ZHENJIANG"},{"id":339,"parentId":0,"regionName":"中山","cityCode":442000,"pinYin":"ZHONGSHAN"},{"id":1184,"parentId":0,"regionName":"中卫","cityCode":640500,"pinYin":"ZHONGWEI"},{"id":93,"parentId":0,"regionName":"周口","cityCode":411600,"pinYin":"ZHOUKOU"},{"id":1055,"parentId":0,"regionName":"舟山","cityCode":330900,"pinYin":"ZHOUSHAN"},{"id":346,"parentId":0,"regionName":"珠海","cityCode":440400,"pinYin":"ZHUHAI"},{"id":484,"parentId":0,"regionName":"驻马店","cityCode":411700,"pinYin":"ZHUMADIAN"},{"id":597,"parentId":0,"regionName":"株洲","cityCode":430200,"pinYin":"ZHUZHOU"},{"id":860,"parentId":0,"regionName":"淄博","cityCode":370300,"pinYin":"ZIBO"},{"id":955,"parentId":0,"regionName":"自贡","cityCode":510300,"pinYin":"ZIGONG"},{"id":957,"parentId":0,"regionName":"资阳","cityCode":512000,"pinYin":"ZIYANG"},{"id":403,"parentId":0,"regionName":"遵义","cityCode":520300,"pinYin":"ZUNYI"}]}}

24、爬虫_解析_jsonpath解析淘票票

import urllib.request

url ='https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1629789477003_137&jsoncallback=jsonp138&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'

headers ={# ':authority': 'dianying.taobao.com',# ':method': 'GET',# ':path': '/cityAction.json?activityId&_ksTS=1629789477003_137&jsoncallback=jsonp138&action=cityAction&n_s=new&event_submit_doGetAllRegion=true',# ':scheme': 'https','accept':'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    # 'accept-encoding': 'gzip, deflate, br','accept-language':'zh-CN,zh;q=0.9',
    'cookie':'cna=UkO6F8VULRwCAXTqq7dbS5A8; miid=949542021157939863; sgcookie=E100F01JK9XMmyoZRigjfmZKExNdRHQqPf4v9NIWIC1nnpnxyNgROLshAf0gz7lGnkKvwCnu1umyfirMSAWtubqc4g%3D%3D; tracknick=action_li; _cc_=UIHiLt3xSw%3D%3D; enc=dA18hg7jG1xapfVGPHoQCAkPQ4as1%2FEUqsG4M6AcAjHFFUM54HWpBv4AAm0MbQgqO%2BiZ5qkUeLIxljrHkOW%2BtQ%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; _m_h5_tk=3ca69de1b9ad7dce614840fcd015dcdb_1629776735568; _m_h5_tk_enc=ab56df54999d1d2cac2f82753ae29f82; t=874e6ce33295bf6b95cfcfaff0af0db6; xlly_s=1; cookie2=13acd8f4dafac4f7bd2177d6710d60fe; v=0; _tb_token_=e65ebbe536158; tfstk=cGhRB7mNpnxkDmUx7YpDAMNM2gTGZbWLxUZN9U4ulewe025didli6j5AFPI8MEC..; l=eBrgmF1cOsMXqSxaBO5aFurza77tzIRb8sPzaNbMiInca6OdtFt_rNCK2Ns9SdtjgtfFBetPVKlOcRCEF3apbgiMW_N-1NKDSxJ6-; isg=BBoas2yXLzHdGp3pCh7XVmpja8A8S54lyLj1RySTHq14l7vRDNufNAjpZ2MLRxa9',
    'referer':'https://dianying.taobao.com/',
    'sec-ch-ua':'"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    'sec-ch-ua-mobile':'?0',
    'sec-fetch-dest':'empty',
    'sec-fetch-mode':'cors',
    'sec-fetch-site':'same-origin',
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    'x-requested-with':'XMLHttpRequest',
}

request = urllib.request.Request(url = url, headers = headers)

response = urllib.request.urlopen(request)

content = response.read().decode('utf-8')# split 切割
content = content.split('(')[1].split(')')[0]

with open('爬虫_解析_jsonpath解析淘票票.json','w',encoding='utf-8')as fp:
    fp.write(content)import json
import jsonpath

obj = json.load(open('爬虫_解析_jsonpath解析淘票票.json','r',encoding='utf-8'))

city_list = jsonpath.jsonpath(obj,'$..regionName')

print(city_list)

25、爬虫_解析_bs4的基本使用

<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><title>Title</title></head><body><div><ul><li id="l1">张三</li><li id="l2">李四</li><li>王五</li><a href=""id=""class="a1">123</a><span>嘿嘿嘿</span></ul></div><a href=""title="a2">百度</a><div id="d1"><span>
            哈哈哈
        </span></div><p id="p1"class="p1">呵呵呵</p></body></html>

26、爬虫_解析_bs4的基本使用


from bs4 import BeautifulSoup

# 通过解析本地文件 来将bs4的基础语法进行讲解# 默认打开的文件的编码格式是gbk 所以在打开文件的时候需要指定编码
soup = BeautifulSoup(open('爬虫_解析_bs4的基本使用.html',encoding='utf-8'),'lxml')# 根据标签名查找节点# 找到的是第一个符合条件的数据# print(soup.a)# 获取标签的属性和属性值# print(soup.a.attrs)# bs4的一些函数# (1)find# 返回的是第一个符合条件的数据# print(soup.find('a'))# 根据title的值来找到对应的标签对象# print(soup.find('a',title="a2"))# 根据class的值来找到对应的标签对象  注意的是class需要添加下划线# print(soup.find('a',class_="a1"))# (2)find_all  返回的是一个列表 并且返回了所有的a标签# print(soup.find_all('a'))# 如果想获取的是多个标签的数据 那么需要在find_all的参数中添加的是列表的数据# print(soup.find_all(['a','span']))# limit的作用是查找前几个数据# print(soup.find_all('li',limit=2))# (3)select(推荐)# select方法返回的是一个列表  并且会返回多个数据# print(soup.select('a'))# 可以通过.代表class  我们把这种操作叫做类选择器# print(soup.select('.a1'))# print(soup.select('#l1'))# 属性选择器---通过属性来寻找对应的标签# 查找到li标签中有id的标签# print(soup.select('li[id]'))# 查找到li标签中id为l2的标签# print(soup.select('li[id="l2"]'))# 层级选择器#  后代选择器# 找到的是div下面的li# print(soup.select('div li'))# 子代选择器#  某标签的第一级子标签# 注意:很多的计算机编程语言中 如果不加空格不会输出内容  但是在bs4中 不会报错 会显示内容# print(soup.select('div > ul > li'))# 找到a标签和li标签的所有的对象# print(soup.select('a,li'))# 节点信息#    获取节点内容# obj = soup.select('#d1')[0]# 如果标签对象中 只有内容 那么string和get_text()都可以使用# 如果标签对象中 除了内容还有标签 那么string就获取不到数据 而get_text()是可以获取数据# 我们一般情况下  推荐使用get_text()# print(obj.string)# print(obj.get_text())# 节点的属性# obj = soup.select('#p1')[0]# name是标签的名字# print(obj.name)# 将属性值左右一个字典返回# print(obj.attrs)# 获取节点的属性
obj = soup.select('#p1')[0]

print(obj.attrs.get('class'))
print(obj.get('class'))
print(obj['class'])

27、爬虫_解析_bs4爬取星巴克数据

import urllib.request

url ='https://www.starbucks.com.cn/menu/'

response = urllib.request.urlopen(url)

content = response.read().decode('utf-8')

from bs4 import BeautifulSoup

soup = BeautifulSoup(content,'lxml')# //ul[@class="grid padded-3 product"]//strong/text()
name_list = soup.select('ul[class="grid padded-3 product"] strong')fornamein name_list:
    print(name.get_text())

28、爬虫_selenium_为什么要学习selenium

import urllib.request

url ='https://www.jd.com/'

response = urllib.request.urlopen(url)

content = response.read().decode('utf-8')

print(content)

29、爬虫_selenium_基本使用

# (1)导入selenium
from selenium import webdriver

# (2) 创建浏览器操作对象

path ='chromedriver.exe'

browser = webdriver.Chrome(path)# (3)访问网站# url = 'https://www.baidu.com'## browser.get(url)

url ='https://www.jd.com/'

browser.get(url)# page_source获取网页源码
content = browser.page_source
print(content)

30、爬虫_selenium_元素定位


from selenium import webdriver

path ='chromedriver.exe'
browser = webdriver.Chrome(path)

url ='https://www.baidu.com'
browser.get(url)# 元素定位# 根据id来找到对象# button = browser.find_element_by_id('su')# print(button)# 根据标签属性的属性值来获取对象的# button = browser.find_element_by_name('wd')# print(button)# 根据xpath语句来获取对象# button = browser.find_elements_by_xpath('//input[@id="su"]')# print(button)# 根据标签的名字来获取对象# button = browser.find_elements_by_tag_name('input')# print(button)# 使用的bs4的语法来获取对象# button = browser.find_elements_by_css_selector('#su')# print(button)# button = browser.find_element_by_link_text('直播')# print(button)

31、爬虫_selenium_元素信息


from selenium import webdriver

path ='chromedriver.exe'
browser = webdriver.Chrome(path)

url ='http://www.baidu.com'
browser.get(url)

input = browser.find_element_by_id('su')# 获取标签的属性
print(input.get_attribute('class'))# 获取标签的名字
print(input.tag_name)# 获取元素文本
a = browser.find_element_by_link_text('新闻')
print(a.text)

32、爬虫_selenium_交互


from selenium import webdriver

# 创建浏览器对象
path ='chromedriver.exe'
browser = webdriver.Chrome(path)# url
url ='https://www.baidu.com'
browser.get(url)importtime
time.sleep(2)# 获取文本框的对象
input = browser.find_element_by_id('kw')# 在文本框中输入周杰伦
input.send_keys('周杰伦')

time.sleep(2)# 获取百度一下的按钮
button = browser.find_element_by_id('su')# 点击按钮
button.click()

time.sleep(2)# 滑到底部
js_bottom ='document.documentElement.scrollTop=100000'
browser.execute_script(js_bottom)

time.sleep(2)# 获取下一页的按钮
next = browser.find_element_by_xpath('//a[@class="n"]')# 点击下一页
next.click()

time.sleep(2)# 回到上一页
browser.back()

time.sleep(2)# 回去
browser.forward()

time.sleep(3)# 退出
browser.quit()

33、爬虫_selenium_phantomjs的基本使用


from selenium import webdriver

path ='phantomjs.exe'

browser = webdriver.PhantomJS(path)

url ='https://www.baidu.com'
browser.get(url)

browser.save_screenshot('baidu.png')importtime
time.sleep(2)

input = browser.find_element_by_id('kw')
input.send_keys('昆凌')

time.sleep(3)

browser.save_screenshot('kunling.png')

34、爬虫_selenium_handless

# from selenium import webdriver# from selenium.webdriver.chrome.options import Options## chrome_options = Options()# chrome_options.add_argument('--headless')# chrome_options.add_argument('--disable-gpu')## # path是你自己的chrome浏览器的文件路径# path = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'# chrome_options.binary_location = path## browser = webdriver.Chrome(chrome_options=chrome_options)### url = 'https://www.baidu.com'## browser.get(url)## browser.save_screenshot('baidu.png')# 封装的handless

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def share_browser():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')# path是你自己的chrome浏览器的文件路径
    path = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
    chrome_options.binary_location = path

    browser = webdriver.Chrome(chrome_options=chrome_options)return browser

browser = share_browser()

url ='https://www.baidu.com'

browser.get(url)

35、爬虫_requests_基本使用

import requests

url ='http://www.baidu.com'

response = requests.get(url=url)# 一个类型和六个属性# Response类型# print(type(response))# 设置响应的编码格式# response.encoding = 'utf-8'# 以字符串的形式来返回了网页的源码# print(response.text)# 返回一个url地址# print(response.url)# 返回的是二进制的数据# print(response.content)# 返回响应的状态码# print(response.status_code)# 返回的是响应头
print(response.headers)

36、爬虫_requests_get请求

# urllib# (1) 一个类型以及六个方法# (2)get请求# (3)post请求   百度翻译# (4)ajax的get请求# (5)ajax的post请求# (6)cookie登陆 微博# (7)代理# requests# (1)一个类型以及六个属性# (2)get请求# (3)post请求# (4)代理# (5)cookie  验证码import requests

url ='https://www.baidu.com/s'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}

data ={'wd':'北京'}# url  请求资源路径# params 参数# kwargs 字典
response = requests.get(url=url,params=data,headers=headers)

content = response.text

print(content)# 总结:# (1)参数使用params传递# (2)参数无需urlencode编码# (3)不需要请求对象的定制# (4)请求资源路径中的?可以加也可以不加

37、爬虫_requests_post请求

import requests

url ='https://fanyi.baidu.com/sug'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}

data ={'kw':'eye'}# url 请求地址# data 请求参数# kwargs 字典
response = requests.post(url=url,data=data,headers=headers)

content =response.text

import json

obj = json.loads(content,encoding='utf-8')
print(obj)# 总结:# (1)post请求 是不需要编解码# (2)post请求的参数是data# (3)不需要请求对象的定制

38、爬虫_requests_代理

import requests

url ='http://www.baidu.com/s?'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
}

data ={'wd':'ip'}

proxy ={'http':'212.129.251.55:16816'}

response = requests.get(url = url,params=data,headers = headers,proxies = proxy)

content = response.text

with open('daili.html','w',encoding='utf-8')as fp:
    fp.write(content)

39、爬虫_requests_cookie登陆古诗文网

# 通过登陆  然后进入到主页面# 通过找登陆接口我们发现 登陆的时候需要的参数很多# _VIEWSTATE: /m1O5dxmOo7f1qlmvtnyNyhhaUrWNVTs3TMKIsm1lvpIgs0WWWUCQHl5iMrvLlwnsqLUN6Wh1aNpitc4WnOt0So3k6UYdFyqCPI6jWSvC8yBA1Q39I7uuR4NjGo=# __VIEWSTATEGENERATOR: C93BE1AE# from: http://so.gushiwen.cn/user/collect.aspx# email: [email protected]# pwd: action# code: PId7# denglu: 登录# 我们观察到_VIEWSTATE   __VIEWSTATEGENERATOR  code是一个可以变化的量# 难点:(1)_VIEWSTATE   __VIEWSTATEGENERATOR  一般情况看不到的数据 都是在页面的源码中#     我们观察到这两个数据在页面的源码中 所以我们需要获取页面的源码 然后进行解析就可以获取了#     (2)验证码import requests

# 这是登陆页面的url地址
url ='https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'

headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}# 获取页面的源码
response = requests.get(url = url,headers = headers)
content = response.text

# 解析页面源码  然后获取_VIEWSTATE   __VIEWSTATEGENERATOR
from bs4 import BeautifulSoup

soup = BeautifulSoup(content,'lxml')# 获取_VIEWSTATE
viewstate = soup.select('#__VIEWSTATE')[0].attrs.get('value')# 获取__VIEWSTATEGENERATOR
viewstategenerator = soup.select('#__VIEWSTATEGENERATOR')[0].attrs.get('value')# 获取验证码图片
code = soup.select('#imgCode')[0].attrs.get('src')
code_url ='https://so.gushiwen.cn' + code

# 有坑# import urllib.request# urllib.request.urlretrieve(url=code_url,filename='code.jpg')# requests里面有一个方法 session()  通过session的返回值 就能使用请求变成一个对象

session = requests.session()# 验证码的url的内容
response_code = session.get(code_url)# 注意此时要使用二进制数据  因为我们要使用的是图片的下载
content_code = response_code.content
# wb的模式就是将二进制数据写入到文件
with open('code.jpg','wb')as fp:
    fp.write(content_code)# 获取了验证码的图片之后 下载到本地 然后观察验证码  观察之后 然后在控制台输入这个验证码 就可以将这个值给# code的参数 就可以登陆

code_name = input('请输入你的验证码')# 点击登陆
url_post ='https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'

data_post ={'__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategenerator,
    'from':'http://so.gushiwen.cn/user/collect.aspx',
    'email':'[email protected]',
    'pwd':'.....',
    'code': code_name,
    'denglu':'登录',
}

response_post = session.post(url = url, headers = headers, data = data_post)

content_post = response_post.text

with open('gushiwen.html','w',encoding=' utf-8')as fp:
    fp.write(content_post)# 难点# (1) 隐藏域# (2) 验证码

40、爬虫_requests_超级鹰打码平台的使用

#!/usr/bin/env python# coding:utf-8import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password =  password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params ={'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers ={'Connection':'Keep-Alive',
            'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: 图片字节
        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
        """
        params ={'codetype': codetype,
        }
        params.update(self.base_params)
        files ={'userfile':('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)return r.json()

    def ReportError(self, im_id):
        """
        im_id:报错题目的图片ID
        """
        params ={'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)return r.json()if __name__ =='__main__':
    chaojiying = Chaojiying_Client('超级鹰用户名', '超级鹰用户名的密码', '96001')#用户中心>>软件ID 生成一个替换 96001
    im = open('a.jpg', 'rb').read()#本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
    print(chaojiying.PostPic(im, 1902))#1902 验证码类型  官方网站>>价格体系 3.4+版 print 后要加()

41、爬虫_scrapy_安装

# (1) pip install scrapy# (2) 报错1: building 'twisted.test.raiser' extension#              error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual C++#              Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools#     解决1#       http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted#       Twisted‑20.3.0‑cp37‑cp37m‑win_amd64.whl#       cp是你的python版本#       amd是你的操作系统的版本#       下载完成之后 使用pip install twisted的路径  安装#       切记安装完twisted 再次安装scrapy# (3) 报错2  提示python -m pip install --upgrade pip#      解决2   运行python -m pip install --upgrade pip# (4) 报错3   win32的错误#      解决3   pip install pypiwin32# (5) anaconda

42、爬虫_scrapy_scrapyshell

# 进入到scrapy shell的终端  直接在window的终端中输入scrapy shell 域名# 如果想看到一些高亮 或者 自动补全  那么可以安装ipython  pip install ipython# scrapy shell www.baidu.com

本文转载自: https://blog.csdn.net/qq_42740465/article/details/129432188
版权归原作者 qq_繁华 所有, 如有侵权,请联系我们删除。

“python 爬虫代码”的评论:

还没有评论