# Listing metadata carried over from the code viewer: 320 lines, 12 KiB, Python
import json
import os
import random
import time
from urllib.parse import quote

from playwright.sync_api import sync_playwright

# File the browser cookies are persisted to between runs (relative path, so it
# is resolved against the current working directory).
COOKIE_PATH = "aiqicha_cookies.json"


class AiqichaCrawler:
    """Playwright-driven crawler for the aiqicha.baidu.com company search.

    The crawler launches a headed Chromium, injects a set of anti-detection
    scripts, restores cookies from ``COOKIE_PATH`` (falling back to an
    interactive QR-code login), and then looks up companies one at a time.

    Attributes are populated by :meth:`run`; the other methods assume
    ``self.page`` / ``self.context`` have already been set.
    """

    _HOME_URL = "https://aiqicha.baidu.com"

    # Selector matching either of the two elements treated as "logged in".
    _LOGGED_IN_SELECTOR = '.header-user-center-menu, .user-center'

    # Re-applied after navigations to drop the automation flag from the
    # navigator prototype.
    _STRIP_WEBDRIVER_JS = """
            delete navigator.__proto__.webdriver;
        """

    def __init__(self):
        self.browser = None
        self.context = None
        self.page = None

    def _strip_webdriver_flag(self):
        """Delete ``navigator.__proto__.webdriver`` in the current document."""
        self.page.evaluate(self._STRIP_WEBDRIVER_JS)

    def anti_detection(self):
        """Register the anti-detection init script on ``self.page``.

        The script is passed to ``page.add_init_script`` and overrides a
        number of ``navigator`` / ``screen`` / ``window`` properties. The
        ``//`` comments inside the payload are part of the injected
        JavaScript and are therefore left untouched.
        """
        self.page.add_init_script("""
            // 隐藏webdriver属性
            delete navigator.__proto__.webdriver;

            // 伪装chrome属性
            Object.defineProperty(navigator, 'chrome', {
                value: {
                    runtime: {},
                    loadTimes: function() {}
                },
                writable: false,
                enumerable: true,
                configurable: true
            });

            // 伪装plugins和mimeTypes
            Object.defineProperty(navigator, 'plugins', {
                get: () => [
                    { 0: { type: 'application/pdf' } },
                    { 0: { type: 'application/x-google-chrome-pdf' } }
                ],
            });

            Object.defineProperty(navigator, 'mimeTypes', {
                get: () => [
                    { type: 'application/pdf' },
                    { type: 'application/x-google-chrome-pdf' }
                ],
            });

            // 伪装languages
            Object.defineProperty(navigator, 'languages', {
                get: () => ['zh-CN', 'zh'],
            });

            // 禁用调试功能
            window.console.debug = function() {};
            window.console.log = function() {};

            // 伪装屏幕信息
            Object.defineProperty(screen, 'width', {get: () => 1366});
            Object.defineProperty(screen, 'height', {get: () => 768});
            Object.defineProperty(screen, 'availWidth', {get: () => 1366});
            Object.defineProperty(screen, 'availHeight', {get: () => 768});
            Object.defineProperty(screen, 'colorDepth', {get: () => 24});
            Object.defineProperty(screen, 'pixelDepth', {get: () => 24});

            // 伪装时间戳
            window.chrome = {
                runtime: {}
            };

            // 伪装outerHeight和outerWidth
            Object.defineProperty(window, 'outerHeight', {get: () => 768});
            Object.defineProperty(window, 'outerWidth', {get: () => 1366});

            // 伪装innerHeight和innerWidth
            Object.defineProperty(window, 'innerHeight', {get: () => 768});
            Object.defineProperty(window, 'innerWidth', {get: () => 1366});
        """)

    def random_behavior(self):
        """Imitate a human: pause, wander the mouse, and maybe scroll."""
        # Initial idle pause.
        time.sleep(random.uniform(2, 5))

        # A handful of mouse moves to random points, with short pauses.
        for _ in range(random.randint(3, 7)):
            self.page.mouse.move(
                random.randint(100, 1200),
                random.randint(100, 600),
            )
            time.sleep(random.uniform(0.1, 0.8))

        # Scroll down by a random distance about half of the time.
        if random.choice([True, False]):
            scroll_distance = random.randint(200, 800)
            self.page.mouse.wheel(0, scroll_distance)
            time.sleep(random.uniform(1, 2))

    def init_cookie_file(self):
        """Create ``COOKIE_PATH`` holding an empty JSON list if it is absent."""
        if not os.path.exists(COOKIE_PATH):
            with open(COOKIE_PATH, 'w', encoding="utf-8") as f:
                json.dump([], f)

    def save_cookies(self):
        """Write the current context's cookies to ``COOKIE_PATH`` as JSON."""
        cookies = self.context.cookies()
        with open(COOKIE_PATH, 'w', encoding="utf-8") as f:
            json.dump(cookies, f, indent=2)

    def load_cookies(self):
        """Load cookies from ``COOKIE_PATH`` into ``self.context``.

        Returns:
            True if a non-empty cookie list was read and added to the
            context; False if the file is missing, unreadable, empty, or the
            cookies were rejected.
        """
        try:
            with open(COOKIE_PATH, 'r', encoding="utf-8") as f:
                cookies = json.load(f)
            if not cookies:
                return False
            self.context.add_cookies(cookies)
            return True
        except Exception:
            # Deliberately best-effort: any failure simply means "no usable
            # cookies" and the caller falls back to an interactive login.
            # (Exception, not a bare except, so Ctrl-C still propagates.)
            return False

    def bypass_debugger(self):
        """Register an init script that neuters a few debugger hooks.

        Overrides ``Function.prototype.constructor`` and ``console.debug``
        with no-ops and makes ``navigator.webdriver`` report ``false``.
        NOTE(review): :meth:`run` does not call this method.
        """
        self.page.add_init_script("""
            window.Function.prototype.constructor = function() {};
            window.console.debug = function(){};
            Object.defineProperty(navigator, 'webdriver', {get: () => false});
        """)

    def check_login_status(self):
        """Report whether the current page shows a logged-in user.

        Returns:
            True when a user-centre element is visible. False when it is
            not — including the case where neither the user-centre element
            nor the ``.login`` element can be seen, and the case where the
            check itself fails.
        """
        try:
            # A "new feature" popup may cover the header; dismiss it first.
            self.close_feature_popup()

            self.page.wait_for_load_state("networkidle")

            markers = self.page.query_selector_all(self._LOGGED_IN_SELECTOR)
            if any(marker and marker.is_visible() for marker in markers):
                print("检测到已登录状态")
                return True

            login_element = self.page.query_selector('.login')
            if login_element and login_element.is_visible():
                print("检测到未登录状态")

            # No logged-in marker was visible: always answer with an explicit
            # False rather than falling off the end of the function.
            return False
        except Exception:
            # The primary check failed (e.g. the network never went idle);
            # give the user-centre element one short, last chance to appear.
            try:
                self.page.wait_for_selector('.user-center', timeout=3000)
                print("检测到已登录状态")
                return True
            except Exception:
                print("登录状态检测异常")
                return False

    def close_feature_popup(self):
        """Click the first visible "new feature" popup close button, if any.

        Best-effort: any error while looking for or clicking the button is
        ignored so that callers can proceed whether or not a popup exists.
        """
        try:
            close_buttons = self.page.query_selector_all('.close-icon.ivu-icon-ios-close')
            for close_button in close_buttons:
                if close_button.is_visible():
                    close_button.click()
                    print("已关闭功能上新弹窗")
                    # Give the popup a moment to disappear.
                    time.sleep(1)
                    break
        except Exception:
            # No popup (or it vanished mid-click) — nothing to do.
            pass

    def login(self, scan_timeout_ms=120000):
        """Open the home page and, if needed, run the QR-code login flow.

        Does nothing further when :meth:`check_login_status` already reports
        a logged-in session. Otherwise clicks the ``.login`` button, waits
        for the user to scan the QR code, and saves the resulting cookies.

        Args:
            scan_timeout_ms: How long to wait, in milliseconds, for a
                logged-in element to appear after the QR code is shown.
                Scanning is a manual step, so the default is two minutes.

        Raises:
            Whatever Playwright raises when the login button or the
            logged-in element does not appear within its timeout.
        """
        self.page.goto(self._HOME_URL)
        self._strip_webdriver_flag()

        self.page.wait_for_load_state("networkidle")
        self.close_feature_popup()

        if self.check_login_status():
            return

        print("开始执行登录流程...")
        login_btn = self.page.wait_for_selector('.login', timeout=20000)
        login_btn.click()

        print("请扫描页面二维码登录...")
        time.sleep(3)  # let the QR code finish rendering

        # Login is considered complete once a user-centre element shows up.
        self.page.wait_for_selector(self._LOGGED_IN_SELECTOR, timeout=scan_timeout_ms)
        self.save_cookies()
        print("登录成功!")

    @staticmethod
    def _inner_text_or_none(parent, selector):
        """Return the inner text of ``selector`` under ``parent``, or None."""
        node = parent.query_selector(selector)
        return node.inner_text() if node else None

    def search_company(self, company_name):
        """Search for a company and return the first result's basic fields.

        Args:
            company_name: Search term; it is percent-encoded before being
                placed in the query string.

        Returns:
            A dict with keys ``name``, ``legal_person``, ``reg_capital`` and
            ``status``. A field whose element is absent from the result card
            is reported as ``None``.

        Raises:
            LookupError: If no ``.search-item`` card can be found.
            Whatever Playwright raises when the results do not appear within
            10 seconds.
        """
        # Encode everything (safe="") so characters such as "&", "#" or
        # spaces in the name cannot alter the query string.
        self.page.goto(f"https://aiqicha.baidu.com/s?q={quote(company_name, safe='')}")
        self._strip_webdriver_flag()

        self.close_feature_popup()

        self.page.wait_for_selector(".search-item", timeout=10000)

        company_card = self.page.query_selector(".search-item")
        if company_card is None:
            raise LookupError(f"no search result card found for {company_name!r}")

        return {
            "name": self._inner_text_or_none(company_card, ".company-name"),
            "legal_person": self._inner_text_or_none(company_card, ".legal-person"),
            "reg_capital": self._inner_text_or_none(company_card, ".reg-capital"),
            "status": self._inner_text_or_none(company_card, ".company-status"),
        }

    def _ensure_logged_in(self):
        """Restore the session from saved cookies, logging in if that fails."""
        if not self.load_cookies():
            print("未找到有效Cookie,开始登录流程...")
            self.login()
            return

        print("已加载Cookie,验证登录状态...")
        # Cookies being present does not prove they are still valid; load the
        # home page and check what it actually shows.
        self.page.goto(self._HOME_URL)
        self.page.wait_for_load_state("networkidle")

        if not self.check_login_status():
            print("Cookie已过期或无效,重新登录...")
            self.login()
        else:
            print("Cookie有效,已登录")

    def run(self, companies):
        """Launch the browser, log in, and look up each company in turn.

        A failure while querying one company is printed and does not stop
        the remaining lookups. The context and browser are closed on the way
        out even if login or setup raises.

        Args:
            companies: Iterable of company names to search for.
        """
        self.init_cookie_file()

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--disable-infobars",
                    "--disable-extensions",
                    "--disable-plugins",
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-web-security",
                    "--disable-features=IsolateOrigins,site-per-process",
                    "--disable-background-timer-throttling",
                    "--disable-backgrounding-occluded-windows",
                    "--disable-renderer-backgrounding",
                    "--disable-ipc-flooding-protection"
                ]
            )
            self.browser = browser
            context = None
            try:
                context = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
                    viewport={"width": 1366, "height": 768},
                    device_scale_factor=1,
                    is_mobile=False,
                    has_touch=False,
                    locale="zh-CN",
                    timezone_id="Asia/Shanghai"
                )
                self.context = context
                self.page = context.new_page()

                self.anti_detection()
                # The init script only runs on future documents; also patch
                # the document that is already open.
                self._strip_webdriver_flag()
                self.random_behavior()

                self._ensure_logged_in()

                for company in companies:
                    try:
                        data = self.search_company(company)
                        print(f"{data['name']} | 法人:{data['legal_person']} | 注册资本:{data['reg_capital']}")
                        self.save_cookies()  # refresh stored cookies after each query
                        time.sleep(3)  # throttle requests
                    except Exception as e:
                        print(f"查询 {company} 失败: {str(e)}")
            finally:
                if context is not None:
                    context.close()
                browser.close()


if __name__ == "__main__":
    # Script entry point: look up a fixed sample of company names.
    target_companies = ["阿里巴巴", "腾讯科技", "华为技术"]
    AiqichaCrawler().run(target_companies)