使用selenium获取网址所加载所有资源url列表信息

程序猿 2021-02-27 21:57:03 2621浏览 加载中

抓取数据时,经常会遇到部分数据是通过 ajax 异步加载的情况。如何通过 selenium 获取页面加载过程中发出的全部请求 url 呢?也就是浏览器开发者工具 Network 面板中记录的请求 url 列表,可以参考下面的代码:

# -*- coding=utf-8 -*-
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
 
url = "http://www.phper163.com/"

# Shape of every record printed by this script.  Fields that are unknown for
# a given record stay as empty strings.  "netloc" is kept only so the output
# keys stay the same as before; nothing in this script fills it in.
_EMPTY_RECORD = {
    "url": "",
    "method": "",
    "status": "",
    "statusText": "",
    "type": "",
    "initiator": "",
    "netloc": "",
    "remoteIPAddress": "",
    "remotePort": "",
}


def build_driver():
    """Create a Chrome driver with the 'performance' log enabled.

    Returns:
        A ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
    """
    chrome_options = Options()
    # Optional flags, typically useful for headless / container runs:
    # chrome_options.add_argument('--no-sandbox')
    # chrome_options.add_argument('--disable-dev-shm-usage')
    # chrome_options.add_argument('--disable-gpu')
    # chrome_options.add_argument('--headless')
    chrome_options.add_experimental_option('w3c', False)

    # NOTE(review): this capability spelling targets the Selenium/ChromeDriver
    # versions the script was written for.  Newer releases reportedly expect
    # options.set_capability('goog:loggingPrefs', ...) and no longer accept
    # `desired_capabilities` -- confirm against the installed version.
    caps = {
        'loggingPrefs': {
            'performance': 'ALL',
        }
    }
    return webdriver.Chrome(desired_capabilities=caps, options=chrome_options)


def parse_performance_logs(entries):
    """Extract request and response summaries from performance log entries.

    Args:
        entries: iterable of log entries as returned by
            ``driver.get_log('performance')``; each entry is a mapping whose
            ``'message'`` value is a JSON string wrapping the actual event
            under a ``"message"`` key.

    Returns:
        ``(requests, responses)``: two lists of dicts built from the
        ``Network.requestWillBeSent`` and ``Network.responseReceived``
        events respectively.  All other events are ignored.  Any field an
        event does not carry is recorded as ``""`` instead of aborting
        the whole run.
    """
    requests = []
    responses = []
    for entry in entries:
        event = json.loads(entry['message'])['message']
        method = event.get("method")
        params = event.get("params", {})
        if method == "Network.responseReceived":
            resp = params.get("response", {})
            responses.append({
                "url": resp.get("url", ""),
                "remoteIPAddress": resp.get("remoteIPAddress", ""),
                "remotePort": resp.get("remotePort", ""),
                "status": resp.get("status", ""),
                "statusText": resp.get("statusText", ""),
                "type": params.get("type", ""),
            })
        elif method == "Network.requestWillBeSent":
            req = params.get("request", {})
            requests.append({
                "url": req.get("url", ""),
                "initiator": params.get("initiator", {}).get("type", ""),
                "method": req.get("method", ""),
                "type": params.get("type", ""),
            })
    return requests, responses


def _make_record(request, response):
    """Build one output record from a request and/or a response summary."""
    record = dict(_EMPTY_RECORD)
    if response is not None:
        record["url"] = response["url"]
        record["remoteIPAddress"] = response["remoteIPAddress"]
        record["remotePort"] = response["remotePort"]
        record["status"] = response["status"]
        record["statusText"] = response["statusText"]
        record["type"] = response["type"]
    if request is not None:
        record["url"] = request["url"]
        record["initiator"] = request["initiator"]
        record["method"] = request["method"]
        # The request's resource type wins when both sides report one.
        if request["type"] or response is None:
            record["type"] = request["type"]
    return record


def merge_records(requests, responses):
    """Pair requests with responses by URL and return the output records.

    Each request is paired with the earliest not-yet-used response that has
    the same URL, so a URL fetched several times produces one record per
    request instead of being dropped.  Requests without a response keep
    empty response fields, and responses that match no request are emitted
    as response-only records after all request records.

    Args:
        requests: list of request dicts from ``parse_performance_logs``.
        responses: list of response dicts from ``parse_performance_logs``.

    Returns:
        A list of dicts with the keys of ``_EMPTY_RECORD``.
    """
    grouped = {}
    for resp in responses:
        grouped.setdefault(resp["url"], []).append(resp)
    # One iterator per URL: advancing it "consumes" a response, and whatever
    # is left afterwards never matched a request.
    queues = {key: iter(group) for key, group in grouped.items()}

    records = []
    for req in requests:
        resp = next(queues.get(req["url"], iter(())), None)
        records.append(_make_record(req, resp))
    for queue in queues.values():
        records.extend(_make_record(None, resp) for resp in queue)
    return records


def collect_resource_urls(page_url, wait_seconds=5):
    """Load ``page_url`` in Chrome and return a record per network resource.

    Args:
        page_url: address of the page to open.
        wait_seconds: fixed pause after ``driver.get`` returns, giving
            asynchronous (ajax) requests time to be issued and logged.

    Returns:
        The list of record dicts produced by ``merge_records``.
    """
    driver = build_driver()
    try:
        driver.get(page_url)
        time.sleep(wait_seconds)
        entries = driver.get_log('performance')
    finally:
        # Always shut the browser down, even if loading or log retrieval
        # fails; quit() ends the whole session, so no separate close().
        driver.quit()
    requests, responses = parse_performance_logs(entries)
    return merge_records(requests, responses)


def main():
    """Collect the resource records for ``url`` and print them."""
    return_list = collect_resource_urls(url)
    print(return_list)


if __name__ == "__main__":
    main()

调用 selenium 并开启其性能(performance)日志收集功能,获取全部日志后,从中筛选出 Network 相关的事件,再对数据进行解析和格式化,提取出需要的字段。

标签: selenium Python
最后修改:2024-04-16 17:09:14

非特殊说明,本博所有文章均为博主原创。