Automatically searching by keyword and scraping the resulting web pages
The target sites come in two flavors: those where the total page count can be read directly from the page, and those where it cannot.
Each case calls for a different approach:
Case 1: scrape the page count first, then scrape the data on every page
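The page-count step below hinges on a single regex pull: the result banner reads something like「共有123家」("123 companies in total"), and the number between 有 and 家 is the total. A minimal, self-contained illustration of that step (the banner string here is made up; the live value comes from the `//div[@class='gys']/dl/dt/span` element in the script):

```python
import re

# Hypothetical banner text; the real one is read from the result page.
source = "共有123家"
total = int(re.findall(r".*有(.*)家", source)[0])  # -> 123
pages = total // 10  # the script below assumes 10 results per page
print(total, pages)  # 123 12
```

The full script for Case 1 follows.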
```python
# coding=utf-8
import urllib.parse
import time
import csv
import re
import random

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

option = webdriver.ChromeOptions()
option.add_argument("headless")
# option.binary_location = r"...\chrome.exe"
option.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
driver = webdriver.Chrome(executable_path=r"...\chromedriver.exe", options=option)

head_url = "部分的头部URL+key="  # placeholder: base search URL ending in "key="

keywords_all = []
keywords = keywords_all[410:444]
keyword_list = []
company_name_list = []
company_url_list = []
phone_list = []


def PageNumber(keyword):
    """Open the first result page and read the total number of results."""
    wd = urllib.parse.quote(keyword.encode('gb2312'))
    turn_url = head_url + wd + ';use_cas=0;f=pclist;p=0'
    driver.get(turn_url)
    # print(driver.page_source)
    time.sleep(random.randint(1, 3))
    try:
        source = driver.find_element(By.XPATH, "//div[@class='gys']/dl/dt/span").text
        reg = re.findall(r".*有(.*)家", source)  # e.g. "共有123家" -> "123"
        total = int(reg[0])
        print("total results:", total)
        return total
    except Exception:
        return -1


def GetResult(keyword, page):
    """Scrape company name, URL and phone from one result page."""
    wd = urllib.parse.quote(keyword.encode('gb2312'))
    turn_url = head_url + wd + ';use_cas=0;f=pclist;p=' + str(page)
    print(turn_url)
    try:
        driver.get(turn_url)
        time.sleep(random.randint(2, 4))
        forms = driver.find_elements(By.XPATH, "//div[@class='gys']/dl/dd/form")
        for l in forms:
            company = l.find_element(By.XPATH, "./table/tbody/tr/td/a").text
            print(company)
            company_name_list.append(company)
            company_url = l.find_element(By.XPATH, "./table/tbody/tr/td/a[1]").get_attribute('href')
            print(company_url)
            company_url_list.append(company_url)
            phone = l.find_element(By.XPATH, "./table/tbody/tr[2]/td[2]").text
            print(phone)
            phone_list.append(phone)
            print(keyword)
            keyword_list.append(keyword)
    except Exception:
        print('failed to load page')


for i in keywords:
    total = PageNumber(keyword=i)
    if total == -1:             # check before dividing: int(-1 / 10) would be 0
        print(i, 'no data')     # and the "no data" branch would never fire
        continue
    page_number = total // 10   # 10 results per page
    if page_number == 0:
        try:
            GetResult(keyword=i, page=0)
        except Exception:
            continue
    else:
        for p in range(page_number):
            try:
                GetResult(keyword=i, page=p)
            except Exception:
                continue

data_list = []
for a, b, c, d in zip(keyword_list, company_name_list, company_url_list, phone_list):
    data_list.append({'keyword': a, 'company_name': b, 'company_url': c, 'phone': d})
# print(data_list)

with open(r"###.csv", 'w', newline='', encoding='UTF-8') as f_c_csv:
    writer = csv.writer(f_c_csv)
    writer.writerow(['keyword', 'company_name', 'company_url', 'phone'])
    for nl in data_list:
        writer.writerow(nl.values())
print("done writing!")
```
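The script imports pandas but never uses it; the same rows could be written with a one-line `DataFrame.to_csv`, which also spares the manual header row. A sketch, assuming `data_list` as built above (`###.csv` is the same placeholder path as in the script):

```python
import pandas as pd

# data_list is the list of row dicts assembled in the loop above.
df = pd.DataFrame(data_list, columns=['keyword', 'company_name', 'company_url', 'phone'])
df.to_csv(r"###.csv", index=False, encoding='UTF-8')
```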
Case 2: the page count cannot be scraped, so we have to keep turning pages until the results run out
```python
# coding=utf-8
import urllib.parse
import time
import csv
import random

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

option = webdriver.ChromeOptions()
option.add_argument("headless")
# option.binary_location = r"...\chrome.exe"
option.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
driver = webdriver.Chrome(executable_path=r"...\chromedriver.exe", options=option)

head_url = "部分头url+keyword="  # placeholder: base search URL ending in "keyword="

keywords_all = []
keywords = keywords_all[400:444]
keyword_list = []
product_name_list = []
company_name_list = []
company_url_list = []
mobilephone_list = []
telephone_list = []

# XPath of one product card in the result grid; shared by both functions below.
ITEM_XPATH = ("//div[@class='lc-grid-list']//div[@class='container']"
              "//div[@class='grid-body']//div[@class='lc-main']"
              "//div[@class='lc-products-wrap']//div[@class='pro-item clearfix ']")


def NextPage(keyword, page):
    """Open a result page and return how many product items it holds."""
    wd = urllib.parse.quote(keyword.encode('utf-8'))
    if page == 0:
        turn_url = head_url + wd
    else:
        turn_url = head_url + wd + "&p=" + str(page)
    print(turn_url)
    driver.get(turn_url)
    time.sleep(random.randint(1, 3))
    items = driver.find_elements(By.XPATH, ITEM_XPATH)
    return len(items)


def GetResult(keyword, page):
    """Scrape product, company and phone fields from one result page."""
    wd = urllib.parse.quote(keyword.encode('utf-8'))
    if page == 0:
        turn_url = head_url + wd
    else:
        turn_url = head_url + wd + "&p=" + str(page)
    driver.get(turn_url)
    time.sleep(random.randint(3, 5))
    try:
        items = driver.find_elements(By.XPATH, ITEM_XPATH)
        for l in items:
            product_name = l.find_element(By.XPATH, "./div[@class='pro-info']/div[@class='intro-box']/div[@class='tt']/a").text
            print(product_name)
            product_name_list.append(product_name)
            try:
                telephone = l.find_element(By.XPATH, "./div[@class='pro-info']/div[@class='basic-box']/div[@class='info']/dl/dd[2]").text
                print(telephone)
                telephone_list.append(telephone)
                mobilephone = l.find_element(By.XPATH, "./div[@class='pro-info']/div[@class='basic-box']/div[@class='info']/dl/dd[3]").text
                print(mobilephone)
                mobilephone_list.append(mobilephone)
            except Exception:
                # NOTE: skipping an item here leaves the lists unequal in length,
                # so the zip() below will mispair rows; appending a placeholder
                # instead would keep them aligned.
                continue
            company = l.find_element(By.XPATH, "./div[@class='pro-info']/div[@class='basic-box']/div[@class='title']/em").text
            print(company)
            company_name_list.append(company)
            for link in l.find_elements(By.XPATH, "./div[@class='pro-info']/div[@class='basic-box']/div[@class='title']/em/a"):
                company_url = link.get_attribute('href')
                print(company_url)
                company_url_list.append(company_url)
            print(keyword)
            keyword_list.append(keyword)
    except Exception:
        print("scrape failed")


for i in keywords:
    this_page = 0
    # A full page holds 20 items: keep paging while pages come back full,
    # then scrape the final, partially filled page.
    while NextPage(keyword=i, page=this_page) > 19:
        GetResult(keyword=i, page=this_page)
        this_page = this_page + 1
    if NextPage(keyword=i, page=this_page) < 20:
        GetResult(keyword=i, page=this_page)

data_list = []
for a, b, c, d, e, f in zip(keyword_list, product_name_list, company_name_list,
                            company_url_list, mobilephone_list, telephone_list):
    data_list.append({'keyword': a, 'product_name': b, 'company_name': c,
                      'company_url': d, 'mobilephone': e, 'telephone': f})
# print(data_list)

with open("###.csv", 'w', newline='', encoding='UTF-8') as f_c_csv:
    writer = csv.writer(f_c_csv)
    writer.writerow(['keyword', 'product_name', 'company_name',
                     'company_url', 'mobilephone', 'telephone'])
    for nl in data_list:
        writer.writerow(nl.values())
print("done writing!")
```
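Both scripts pace themselves with fixed `time.sleep(...)` calls, which either waste time on fast pages or fire too early on slow ones. A possible refinement (not in the original) is an explicit wait that polls until the product items render; `wait_for_items` is a hypothetical helper, and `ITEM_XPATH` is the product-card XPath defined in the script above:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_items(driver, xpath, timeout=10):
    """Wait until at least one matching element is present, then return them all.
    Returns [] if nothing shows up within `timeout` seconds."""
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.XPATH, xpath)))
    except Exception:
        return []
    return driver.find_elements(By.XPATH, xpath)

# e.g. inside NextPage / GetResult, instead of time.sleep(...):
# items = wait_for_items(driver, ITEM_XPATH)
```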
Reposted from: https://blog.csdn.net/The_Ruthless/article/details/128101139
Copyright belongs to the original author, 一个无情的靓女. In case of infringement, please contact us for removal.