selenium使用PhantomJS,主要有两个原因:
1、使用“无界面”浏览器操作;2、调用一些js函数实现一些功能,比如网页的长截图。
一、下载
1、selenium下载直接通过pip安装即可
但是新版本的selenium已经不再支持PhantomJS,因此需要安装旧版本的selenium(但是版本太旧也不行)
例如:
pip install selenium==2.48.0
2、下载Chromedriver
下载与自己浏览器版本相对应的Chromedriver版本:
http://chromedriver.storage.googleapis.com/index.html
下载windows版本(32位),下载后解压,将chromedriver.exe放入python安装路径
3、PhantomJS下载安装
1、PhantomJS需要下载安装(有时还需要加入环境变量);
Download PhantomJS
2、下载后解压,将路径加入环境变量Path;
3、再将.exe复制到python的安装路径内;
4、pip 安装phantomjs库
二、测试代码
from selenium import webdriver
import selenium
# Smoke-test PhantomJS: start the headless browser, then shut it down again.
driver = selenium.webdriver.PhantomJS()
# When phantomjs is not on PATH, point executable_path at the binary instead:
# driver = selenium.webdriver.PhantomJS(executable_path=r'D:\softs\phantomjs-2.1.1\phantomjs-2.1.1-windows/bin/phantomjs.exe')
# quit() ends the whole WebDriver session; close() would only close the
# current window and leave the session running.
driver.quit()

# Smoke-test chromedriver with Chrome running headless.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)
try:
    browser.get('https://www.baidu.com/')
finally:
    # Release the browser session even when the page load raises.
    browser.quit()
三、selenium、phantomjs执行js代码,替换浏览器里网页的节点的源码
# Replace the <body> markup of the page loaded in the webdriver browser
# (i.e. modify the page source in place).
# The HTML is handed to the script as arguments[0] instead of being formatted
# into the JavaScript text, so quotes, backslashes and newlines inside
# new_pagesource cannot break the script.
driver.execute_script('document.querySelector("body").innerHTML = arguments[0];', new_pagesource)
四、长截图
"""截网页长图"""
def get_long_png(web, no, wait_seconds=10):
    """Save a screenshot of the page currently loaded in ``web`` as a PNG.

    The file name is built from ``no`` with the pattern '.\\imgs\\{}.png'.
    NOTE(review): with PhantomJS the screenshot presumably covers the whole
    page height (a "long" screenshot) -- confirm for other drivers.

    Args:
        web: WebDriver-like object exposing ``save_screenshot(path)``.
        no: Value formatted into the output file name.
        wait_seconds: Seconds to sleep before capturing; defaults to the
            previously hard-coded 10.

    Returns:
        None. Progress, failures and the elapsed time are printed.
    """
    started = time.time()
    target = '.\\imgs\\{}.png'.format(no)
    try:
        # Presumably gives the page time to finish rendering before capture.
        time.sleep(wait_seconds)
        # save_screenshot() reports a failed write by returning False instead
        # of raising, so the result must be checked before claiming success.
        if web.save_screenshot(target):
            print('{} saved!'.format(target))
        else:
            print('{} save failed!'.format(target))
    except Exception as msg:
        # Best-effort: report driver errors and carry on. KeyboardInterrupt
        # and SystemExit are deliberately not swallowed.
        print(msg)
    print("get pic use:" + str(time.time() - started))
五、获取文本叶子节点及tag_path(Xpath)
"""获取文本叶子节点及tag_path text_node"""
def _remove_keeping_tail(node):
    """Detach ``node`` from its parent without losing the text that follows it."""
    parent = node.getparent()
    if parent is None:
        return
    if node.tail:
        # lxml stores the text after an element's end tag on the element
        # itself (``tail``); a plain parent.remove(node) would drop that text.
        previous = node.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + node.tail
        else:
            parent.text = (parent.text or '') + node.tail
    parent.remove(node)


def get_one_text_node(web, file_name):
    """Collect the text-bearing nodes of the page loaded in ``web``.

    The page source is parsed with lxml, unwanted nodes are pruned, and for
    every remaining element with non-empty direct text the matching live
    WebElement is looked up by the path returned from ``getEtreeXPath``.
    Nodes that pass the size and length filters are wrapped in
    ``TextNodeInfo`` and emitted through its ``print(file_name)`` method.

    Args:
        web: Selenium WebDriver with the target page loaded.
        file_name: Passed to ``TextNodeInfo.print`` and used in the final
            progress message.

    Returns:
        list: The ``TextNodeInfo`` objects that were emitted (the original
        built this list but never returned it).
    """
    # Strip HTML comments with a regex first. The lazy raw pattern matches the
    # same spans as the previous nested-quantifier pattern but cannot
    # backtrack exponentially on an unterminated comment.
    del_re = r'<!--[\s\S]*?-->'
    root = etree.HTML(re.sub(del_re, '', web.page_source))
    # Prune style/script/img/base/head/link/input elements, plus any element
    # whose style, type, or one of its first four attributes contains "hidden".
    del_nodes = root.xpath('//style|//script|//img|//base|//head|//link|//input|'
                           '//*[contains(@*[1],"hidden")]|'
                           '//*[contains(@*[2],"hidden")]|'
                           '//*[contains(@*[3],"hidden")]|'
                           '//*[contains(@*[4],"hidden")]|'
                           '//*[contains(@style,"hidden")]|'
                           '//*[contains(@type,"hidden")]')
    for node in del_nodes:
        _remove_keeping_tail(node)
    body = root.xpath('//body')[0]
    all_text_nodes = body.xpath('.//*[text()]')
    # Use the larger of the <html> element size and the document scroll size.
    window_size = web.find_element_by_xpath('//html').size
    docu_height = web.execute_script('return document.body.scrollHeight')
    docu_width = web.execute_script('return document.body.scrollWidth')
    window_size = {'width': max(window_size.get('width'), docu_width),
                   'height': max(window_size.get('height'), docu_height)}
    text_node_info_list = []
    for node in all_text_nodes:
        # Direct text of the element with whitespace runs collapsed.
        node_text = re.sub(r'\s+', ' ', "".join(node.xpath('./text()'))).strip()
        if not node_text:
            continue
        tag_path = getEtreeXPath(node, root)
        webelement = web.find_element_by_xpath(tag_path)
        # Skip elements that are not rendered (zero width and zero height).
        size = webelement.size
        if size.get('height') == 0 and size.get('width') == 0:
            continue
        # Drop everything from the first opening paren to the last closing one.
        # NOTE(review): both characters in each class are ASCII parens here;
        # presumably one of each was a full-width paren originally -- confirm.
        ttext = re.sub(r'[((][\s\S]*[))]', '', node_text).strip()
        # Skip short text: fewer than 10 characters outside \x00-\xff and
        # fewer than 5 runs of ASCII letters/hyphens.
        if len(re.findall(r'[^\x00-\xff]', ttext)) < 10 and \
                len(re.findall(r'[A-Za-z-]+', ttext)) < 5:
            continue
        print(ttext)
        one_node_info = TextNodeInfo(dom_node=node, tag_path=tag_path, webelement=webelement,
                                     window_size=window_size,
                                     label_xpath_set=node_label_xpath.zw_xpath_set)
        if len(one_node_info.text_content) > 0:
            one_node_info.print(file_name)
            text_node_info_list.append(one_node_info)
    print("{} write finish".format(file_name))
    return text_node_info_list
六、js获取页面文档的 高度和宽度
# Read the document's height and width through JavaScript
# (the document's scroll size, not the browser window size).
docu_height, docu_width = (
    web.execute_script('return document.body.scrollHeight'),
    web.execute_script('return document.body.scrollWidth'),
)
版权归原作者 qq_38767359 所有, 如有侵权,请联系我们删除。