0


爬虫实战总结

总结一下爬虫学习中的实例实战

1.金山翻译

  1. # -*- coding: utf-8 -*-
  2. import requests
  3. # 获取翻译包的url,需要去掉多余的保护壳:
  4. # https://ifanyi.iciba.com/index.php?c=trans&m=fy&client=6&auth_user=key_web_new_fanyi&sign=9X%2BHAviAKqteMMuVvr%2B0X9RriqVIAJSQ%2BxmfU0q7dIE%3D
  5. url = 'https://ifanyi.iciba.com/index.php?c=trans'
  6. # 构建请求头
  7. headers = {
  8. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
  9. 'Referer': 'https://www.iciba.com/',
  10. 'Host': 'ifanyi.iciba.com'
  11. }
  12. while True:
  13. # 实现用户输入的功能
  14. content = input('请输入您想翻译的内容(输入"exit"结束程序):')
  15. # 检查是否需要退出
  16. if content.lower() == 'exit':
  17. break
  18. # 构建参数字典
  19. post_data = {
  20. 'from': 'auto',
  21. 'to': 'auto',
  22. 'q': content,
  23. }
  24. # 发送请求
  25. res = requests.post(url, headers=headers, data=post_data)
  26. res_1 = res.content.decode()
  27. # 输出翻译结果
  28. print(eval(res_1)['out'])

2.github模拟登录

  1. # -*- coding: utf-8 -*-
  2. import re
  3. # 1.获取并模拟登录操作 2.保存登录会话信息 3.验证是否登录成功
  4. import requests
  5. from requests import Session
  6. def do_auth_token(session: Session):
  7. global response
  8. response = session.get('https://github.com/login')
  9. if response.status_code != 200:
  10. print("请求失败,请稍后再试!")
  11. exit(0)
  12. login_html = response.content.decode()
  13. auth_token = re.findall(r'name="authenticity_token" value="(.*?)"', login_html)[0]
  14. return auth_token
  15. def do_auth_login(session: Session):
  16. post_data = {
  17. "commit": "Sign in",
  18. "authenticity_token": auth_token,
  19. "login": "2834438515@qq.com",
  20. "password": "991016csq", # 登录密码,为了个人账号安全我这里不是真实密码
  21. "webauthn-conditional": "undefined",
  22. "javascript-support": "true",
  23. "webauthn-support": "supported",
  24. "webauthn-iuvpaa-support": "unsupported",
  25. "return_to": "https://github.com/login"
  26. }
  27. response = session.post(url='https://github.com/session', data=post_data)
  28. if response.status_code != 200:
  29. print("请求失败,请检查参数!")
  30. else:
  31. print("请求session 成功!")
  32. def do_login_status(session: Session):
  33. response = session.get('https://github.com/csqting')
  34. html_content = response.content
  35. response1 = re.findall(r'<title>(.+?)(GitHub)?</title>', html_content.decode('utf-8'))
  36. try:
  37. end_str = response1[0][1]
  38. except IndexError:
  39. end_str = ""
  40. if end_str == "":
  41. # 个人主页的title内容如果结尾没有GitHub,说明登录成功
  42. print("登录成功!")
  43. else:
  44. print("登录失败!")
  45. with open("github_profile.html", "wb") as f:
  46. f.write(html_content)
  47. if __name__ == '__main__':
  48. # 使用session进行状态保持
  49. session = requests.session()
  50. session.headers = {
  51. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
  52. }
  53. # 1. 获取并模拟登录操作
  54. auth_token = do_auth_token(session)
  55. # 2. 保存登录会话信息
  56. do_auth_login(session)
  57. # 3. 验证是否登录成功
  58. do_login_status(session)

3.百度贴吧爬取

  1. # -*- coding: utf-8 -*-
  2. import requests
  3. from lxml import etree
  4. # url
  5. # headers
  6. # 发送请求获取响应
  7. # 从响应中提取数据
  8. # 判断结束
  9. class Tieba(object):
  10. def __init__(self, name):
  11. self.url = "https://tieba.baidu.com/f?kw={}".format(name)
  12. print(self.url)
  13. self.headers = {
  14. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
  15. # "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T132461)"
  16. }
  17. def get_data(self, url):
  18. response = requests.get(url, headers=self.headers)
  19. with open("temp.html", "wb") as f:
  20. f.write(response.content)
  21. return response.content
  22. def parse_data(self, data):
  23. # 创建element对象
  24. data = data.decode().replace("<!--", "").replace("-->", "")
  25. html = etree.HTML(data)
  26. el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
  27. # print(len(el_list))
  28. data_list = []
  29. for el in el_list:
  30. temp = {}
  31. temp['title'] = el.xpath('./text()')[0]
  32. temp['link'] = 'https://tieba.baidu.com' + el.xpath('./@href')[0]
  33. data_list.append(temp)
  34. # 获取下一页url
  35. try:
  36. next_url = 'https:' + html.xpath('//a[contains(text(),"下一页>")]/@href')[0]
  37. except:
  38. next_url = None
  39. return data_list, next_url
  40. def save_data(self, data_list):
  41. for data in data_list:
  42. print(data)
  43. def run(self):
  44. next_url = self.url
  45. while True:
  46. # 发送请求获取响应
  47. data = self.get_data(next_url)
  48. # 从响应中提取数据,数据和翻页用的url
  49. data_list, next_url = self.parse_data(data)
  50. self.save_data(data_list)
  51. print(next_url)
  52. # 判断是否结束
  53. if next_url == None:
  54. break
  55. if __name__ == '__main__':
  56. tieba = Tieba("美食天下")
  57. tieba.run()

4.斗鱼直播

  1. # -*- coding: utf-8 -*-
  2. from selenium import webdriver
  3. from selenium.webdriver.common.by import By
  4. import time
  5. class Douyu(object):
  6. def __init__(self):
  7. self.url = 'https://www.douyu.com/directory/all'
  8. self.driver = webdriver.Chrome()
  9. self.driver.implicitly_wait(10) # 设置隐式等待,最大等待10秒
  10. def parse_data(self):
  11. room_list = self.driver.find_elements(By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li/div')
  12. print(len(room_list))
  13. data_list = []
  14. # 遍历房间列表,从每一个房间节点获取数据
  15. for room in room_list:
  16. temp = {}
  17. # temp['title'] = room.find_element(By.XPATH, './div[2]/div[1]/a').text
  18. # temp['type'] = room.find_element(By.XPATH, './div[2]/div[2]/span/a').text
  19. # temp['owner'] = room.find_element(By.XPATH, './div[1]/div/a/div/div[2]/div/div[1]/div').text
  20. # temp['num'] = room.find_element(By.XPATH, './div[1]/div/a/div/div[2]/div/div[2]/span').text
  21. temp['picture'] = room.find_element(By.XPATH, './div[1]/picture/source[1]').get_attribute('srcset')
  22. # print(temp)
  23. data_list.append(temp)
  24. return data_list
  25. def run(self):
  26. self.driver.get(self.url)
  27. total_rooms = 0
  28. last_count = 0 # 上一次获取的房间数量
  29. while True:
  30. # 滚动到页面底部
  31. self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
  32. time.sleep(2) # 等待页面加载新内容
  33. # 获取当前房间数据
  34. new_data = self.parse_data()
  35. total_rooms += len(new_data)
  36. print(f"Total rooms : {total_rooms}")
  37. # 检查当前房间数量
  38. if total_rooms == last_count: # 如果新加载的房间数量没有增加,停止滚动
  39. print("No more new data to load.")
  40. break
  41. last_count = total_rooms # 更新最后一次的房间数量
  42. print(f"Final total rooms fetched: {total_rooms}")
  43. self.driver.quit() # 退出浏览器
  44. if __name__ == '__main__':
  45. douyu = Douyu()
  46. douyu.run()

5.黑马贴吧

  1. import requests
  2. import re
  3. def fetch_page(url):
  4. headers = {
  5. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
  6. }
  7. response = requests.get(url, headers=headers)
  8. if response.status_code == 200:
  9. # 使用正则表达式提取文章标题
  10. titles = re.findall(r'class="s xst">([^<]+)</a>', response.text)
  11. # 提取发布时间和作者
  12. details = re.findall(
  13. r'<span style="margin-left: 0;">([^<]+)</span></a><span style="margin-left: 5px;">@ ([^<]+)</span>',
  14. response.text)
  15. authors = [detail[0] for detail in details]
  16. dates = [detail[1] for detail in details]
  17. # 输出提取的结果
  18. for title, date, author in zip(titles, dates, authors):
  19. print(f"文章标题: {title}")
  20. print(f"发布时间: {date}")
  21. print(f"文章作者: {author}")
  22. print('-' * 40)
  23. # 使用正则表达式提取下一页的链接,search第一次出现
  24. next_page_link = re.search(r'<a href="([^"]+)" class="nxt">下一页</a>', response.text)
  25. if next_page_link:
  26. return next_page_link.group(1) # 返回完整的链接
  27. else:
  28. return None
  29. else:
  30. print("访问失败", response.status_code)
  31. return None
  32. # 初始页面
  33. current_url = 'https://bbs.itheima.com/forum-425-1.html'
  34. # 循环遍历每一页,直到没有下一页
  35. while current_url:
  36. print(f"正在爬取: {current_url}")
  37. next_url = fetch_page(current_url)
  38. current_url = next_url

6.网易云

  1. # -*- coding: utf-8 -*-
  2. # document.charset 查看源码编码格式
  3. import requests
  4. import time
  5. import re
  6. import os
  7. filename = 'musics\\'
  8. # 如果没有则创建文件夹,os与操作系统实现交互功能(创建文件夹和目录)
  9. if not os.path.exists(filename):
  10. os.makedirs(filename)
  11. url = 'https://music.163.com/discover/toplist?id=3778678'
  12. headers = {
  13. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'}
  14. response = requests.get(url, headers=headers)
  15. time.sleep(5)
  16. # re.findall
  17. # 这个函数用于在字符串中查找所有与正则表达式模式匹配的部分,并返回一个包含所有匹配项的列表
  18. # r 前缀表示这是一个原始字符串,其中的反斜杠不会被解释为转义字符
  19. # (\d+): 捕获组,匹配一个或多个数字
  20. # (.*?): 捕获组,非贪婪匹配任何字符(包括空字符),直到遇到 </a>
  21. # print(response.text)
  22. html_data = re.findall(r'<li><a href="/song\?id=(\d+)">(.*?)</a>', response.text)
  23. for num_id, title in html_data:
  24. # f-string 直接嵌入表达式
  25. music_download = f'https://music.163.com/song/media/outer/url?id={num_id}.mp3'
  26. music_content = requests.get(music_download, headers=headers)
  27. with open('musics\\' + title + '.mp3', 'wb') as f:
  28. f.write(music_content.content)
  29. print(num_id, title)

7.微博热榜

  1. # # -*- coding: utf-8 -*-
  2. # import time
  3. # from lxml import etree
  4. # import requests
  5. #
  6. # url = 'https://m.weibo.cn/p/106003type=25&t=3&disable_hot=1&filter_type=realtimehot'
  7. # headers = {
  8. # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'}
  9. # response = requests.get(url, headers=headers)
  10. # time.sleep(3)
  11. # print(response.text)
  12. # html = etree.HTML(response.text)
  13. # el_list = html.xpath('//*[@id="app"]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/span[2]/span[1]/text()')
  14. # print(len(el_list))
  15. import time
  16. from selenium import webdriver
  17. from selenium.webdriver.common.by import By
  18. url = 'https://m.weibo.cn/p/106003type=25&t=3&disable_hot=1&filter_type=realtimehot'
  19. driver = webdriver.Chrome()
  20. driver.get(url)
  21. time.sleep(3)
  22. el_list = driver.find_elements(By.XPATH,'//*[@id="app"]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/span[2]/span[1]')
  23. # print(len(el_list))
  24. el_list1 = driver.find_elements(By.XPATH,'//*[@id="app"]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/span[2]/span[2]')
  25. # print(len(el_list1))
  26. # save_out = []
  27. i=1
  28. for title,hot in zip(el_list,el_list1):
  29. # save_out.append(f"{i}\n") # 添加行号
  30. # save_out.append(f"文章标题: {title.text}\n") # 添加文章标题
  31. # save_out.append(f"热度: {hot.text}\n") # 添加热度
  32. print(f"{i}")
  33. print(f"文章标题: {title.text}")
  34. print(f"热度: {hot.text}")
  35. i += 1
  36. print('-' * 40)
  37. # with open("weibo.txt","w") as file:
  38. # file.writelines(save_out)
  39. driver.quit()

8.驾校自动答题

  1. # -*- coding: utf-8 -*-
  2. import time
  3. from selenium import webdriver
  4. from selenium.webdriver.common.by import By
  5. url = 'https://www.jsyks.com/kmy-mnks'
  6. driver = webdriver.Chrome()
  7. driver.get(url)
  8. # 1.获取答案xpath
  9. # 2.替换匹配答案
  10. # 3.执行滑动点击操作
  11. time.sleep(3)
  12. el_list = driver.find_elements(By.XPATH, '/html/body/div[4]/div[1]/div[1]/ul/li')
  13. # print(len(el_list))
  14. # 使用get_attribute('标签名')获取标签值,保存正确选项
  15. k_values = []
  16. for li in el_list:
  17. k_values.append(li.get_attribute('k')) # 'E'表示错误,'R'表示正确
  18. # 使用列表推导式替换字符,E R都不是保持原样字符
  19. replaced_list = ["正确" if x == 'R' else "错误" if x == 'E' else x for x in k_values]
  20. for index, li in enumerate(el_list):
  21. answer = replaced_list[index]
  22. if answer == '正确' or answer == '错误':
  23. option = li.find_element(By.XPATH, f".//b[contains(text(),'{answer}')]")
  24. else:
  25. # 使用 starts-with 函数查找以特定字符开始的文本,答案为A、B、C、D的情况
  26. option = li.find_element(By.XPATH, f".//b[starts-with(normalize-space(text()), '{answer}')]")
  27. # 滚动到指定元素
  28. driver.execute_script('arguments[0].scrollIntoView();', option)
  29. # 使用JavaScript点击选项
  30. driver.execute_script("arguments[0].click();", option)

后期学习路线:继续在实战中总结反爬手段,学习反调试,以及之后的爬虫完整项目学习。

标签: 爬虫

本文转载自: https://blog.csdn.net/2301_77869606/article/details/142965852
版权归原作者 菜鸡中的奋斗鸡→挣扎鸡 所有, 如有侵权,请联系我们删除。

“爬虫实战总结”的评论:

还没有评论