0


selenium案例——爬取哔哩哔哩排行榜

案例需求:

1.使用selenium自动化爬虫爬取哔哩哔哩排行榜中舞蹈类的数据(包括视频标题、up主、播放量和评论量)

2.利用bs4进行数据解析和提取

3.将爬取的数据保存在本地json文件中

4.保存在excel文件中

分析:

1.请求url地址:https://www.bilibili.com/v/popular/rank/dance

b6b20cf86cd1420faabfbda447086cd3.png

2.设置显式等待,等排行榜元素加载完成后再获取数据,否则获取到的数据不完整

  1. wait = WebDriverWait(self.browsers, 280)
  2. wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'rank-item')))
  3. time.sleep(5)

3.获取响应内容(不断滚动到页面底部,直到页面高度不再变化)

  1. last_height = self.browsers.execute_script("return document.body.scrollHeight")
  2. while True:
  3. self.browsers.execute_script('window.scrollTo(0, document.body.scrollHeight);')
  4. time.sleep(5)
  5. data = self.browsers.page_source # 获取网页源码
  6. self.parse_data(data=data)
  7. new_height = self.browsers.execute_script("return document.body.scrollHeight")
  8. if new_height == last_height:
  9. break
  10. last_height = new_height

4.使用bs4解析数据

  1. soup = BeautifulSoup(data, 'lxml')
  2. titles = soup.select('.info .title') # 标题
  3. up_names = soup.select('.info .up-name') # up主
  4. # :nth-of-type(2) 用于选择指定类型的第二个元素
  5. play_counts = soup.select('.info .detail-state .data-box:nth-of-type(1)') # 播放量
  6. comment_counts = soup.select('.info .detail-state .data-box:nth-of-type(2)') # 评论量
  7. rank_data = {}
  8. print(len(titles))
  9. for title, name, play_count, comment_count in zip(titles, up_names, play_counts, comment_counts):
  10. t = title.get_text().strip()
  11. n = name.get_text().strip()
  12. p = play_count.get_text().strip()
  13. c = comment_count.get_text().strip()
  14. print('标题:', t)
  15. print('up主:', n)
  16. print('播放量:', p)
  17. print('评论量:', c)
  18. print('==========================')

5.保存在本地json文件中

  1. with open('rank_data.json', 'a', encoding='utf-8') as f:
  2. f.write(json.dumps(rank_data, ensure_ascii=False) + '\n')

6.保存在excel文件中

  1. wb =workbook.Workbook()#创建一个EXcel对象 就相当于是要生成一个excel 程序
  2. ws = wb.active #激活当前表
  3. ws.append(['标题','up主','播放量','评论量'])
  1. #保存数据
  2. def save_data(self,title,name,paly,comment):
  3. ws.append([title,name,paly,comment])
  4. # 保存为Excel数据
  5. wb.save('哔哩哔哩排行榜数据.xlsx')

案例代码:

  1. from selenium import webdriver
  2. from selenium.webdriver.common.by import By
  3. from selenium.webdriver.support.wait import WebDriverWait
  4. from selenium.webdriver.support import expected_conditions as EC
  5. from bs4 import BeautifulSoup
  6. from openpyxl import workbook #第三方模块 需要安装
  7. import time
  8. import json
  9. wb =workbook.Workbook()#创建一个EXcel对象 就相当于是要生成一个excel 程序
  10. ws = wb.active #激活当前表
  11. ws.append(['标题','up主','播放量','评论量'])
  12. class Spider:
  13. def __init__(self):
  14. self.url = 'https://www.bilibili.com/v/popular/rank/dance'
  15. self.options = webdriver.ChromeOptions()
  16. self.options.add_experimental_option('excludeSwitches', ['enable-automation'])
  17. self.browsers = webdriver.Chrome(options=self.options)
  18. # 访问哔哩哔哩排行榜
  19. def get_bili(self):
  20. self.browsers.get(self.url)
  21. wait = WebDriverWait(self.browsers, 280)
  22. wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'rank-item')))
  23. time.sleep(5)
  24. # 获取响应内容
  25. def get_data(self):
  26. last_height = self.browsers.execute_script("return document.body.scrollHeight")
  27. while True:
  28. self.browsers.execute_script('window.scrollTo(0, document.body.scrollHeight);')
  29. time.sleep(5)
  30. data = self.browsers.page_source # 获取网页源码
  31. self.parse_data(data=data)
  32. new_height = self.browsers.execute_script("return document.body.scrollHeight")
  33. if new_height == last_height:
  34. break
  35. last_height = new_height
  36. # 解析信息
  37. def parse_data(self, data):
  38. soup = BeautifulSoup(data, 'lxml')
  39. titles = soup.select('.info .title') # 标题
  40. up_names = soup.select('.info .up-name') # up主
  41. # :nth-of-type(2) 用于选择指定类型的第二个元素
  42. play_counts = soup.select('.info .detail-state .data-box:nth-of-type(1)') # 播放量
  43. comment_counts = soup.select('.info .detail-state .data-box:nth-of-type(2)') # 评论量
  44. rank_data = {}
  45. print(len(titles))
  46. for title, name, play_count, comment_count in zip(titles, up_names, play_counts, comment_counts):
  47. t = title.get_text().strip()
  48. n = name.get_text().strip()
  49. p = play_count.get_text().strip()
  50. c = comment_count.get_text().strip()
  51. print('标题:', t)
  52. print('up主:', n)
  53. print('播放量:', p)
  54. print('评论量:', c)
  55. print('==========================')
  56. self.save_data(t,n,p,c)
  57. rank_data['标题'] = t
  58. rank_data['up主'] = n
  59. rank_data['播放量'] = p
  60. rank_data['评论量'] = c
  61. with open('rank_data.json', 'a', encoding='utf-8') as f:
  62. f.write(json.dumps(rank_data, ensure_ascii=False) + '\n')
  63. #保存数据
  64. def save_data(self,title,name,paly,comment):
  65. ws.append([title,name,paly,comment])
  66. # 保存为Excel数据
  67. wb.save('哔哩哔哩排行榜数据.xlsx')
  68. if __name__ == '__main__':
  69. s = Spider()
  70. s.get_bili()
  71. s.get_data()

运行结果:

5abf89ee4853433b8ecc1a48f21da997.png

cbfa7daafb1046e4bd36bc338176cbd3.png

aa5d35c7398548eeb09ea5b1f04b53e9.png


本文转载自: https://blog.csdn.net/qq_53256193/article/details/142713956
版权归原作者 人生の三重奏 所有, 如有侵权,请联系我们删除。

“selenium案例——爬取哔哩哔哩排行榜”的评论:

还没有评论