"""Playwright-based crawler that looks up basic company data on aiqicha.baidu.com.

The crawler keeps a cookie jar on disk (``COOKIE_PATH``) so that a QR-code
login only has to be performed when the stored session is missing or expired.
"""
import json
import os
import random
import time
from urllib.parse import quote

from playwright.sync_api import sync_playwright

COOKIE_PATH = "aiqicha_cookies.json"

BASE_URL = "https://aiqicha.baidu.com"

# Elements that are only rendered for an authenticated session.
LOGGED_IN_SELECTOR = '.header-user-center-menu, .user-center'

# Removes the automation flag from the navigator prototype of the current page.
_DELETE_WEBDRIVER_JS = """
    delete navigator.__proto__.webdriver;
"""

# Init script installed by ``anti_detection``; it runs before any page script.
# The payload is runtime text and is kept as originally authored.
_ANTI_DETECTION_JS = """
    // 隐藏webdriver属性
    delete navigator.__proto__.webdriver;

    // 伪装chrome属性
    Object.defineProperty(navigator, 'chrome', {
        value: {
            runtime: {},
            loadTimes: function() {}
        },
        writable: false,
        enumerable: true,
        configurable: true
    });

    // 伪装plugins和mimeTypes
    Object.defineProperty(navigator, 'plugins', {
        get: () => [
            { 0: { type: 'application/pdf' } },
            { 0: { type: 'application/x-google-chrome-pdf' } }
        ],
    });

    Object.defineProperty(navigator, 'mimeTypes', {
        get: () => [
            { type: 'application/pdf' },
            { type: 'application/x-google-chrome-pdf' }
        ],
    });

    // 伪装languages
    Object.defineProperty(navigator, 'languages', {
        get: () => ['zh-CN', 'zh'],
    });

    // 禁用调试功能
    window.console.debug = function() {};
    window.console.log = function() {};

    // 伪装屏幕信息
    Object.defineProperty(screen, 'width', {get: () => 1366});
    Object.defineProperty(screen, 'height', {get: () => 768});
    Object.defineProperty(screen, 'availWidth', {get: () => 1366});
    Object.defineProperty(screen, 'availHeight', {get: () => 768});
    Object.defineProperty(screen, 'colorDepth', {get: () => 24});
    Object.defineProperty(screen, 'pixelDepth', {get: () => 24});

    // 伪装时间戳
    window.chrome = {
        runtime: {}
    };

    // 伪装outerHeight和outerWidth
    Object.defineProperty(window, 'outerHeight', {get: () => 768});
    Object.defineProperty(window, 'outerWidth', {get: () => 1366});

    // 伪装innerHeight和innerWidth
    Object.defineProperty(window, 'innerHeight', {get: () => 768});
    Object.defineProperty(window, 'innerWidth', {get: () => 1366});
"""

# Init script installed by ``bypass_debugger``.
_BYPASS_DEBUGGER_JS = """
    window.Function.prototype.constructor = function() {};
    window.console.debug = function(){};
    Object.defineProperty(navigator, 'webdriver', {get: () => false});
"""


class AiqichaCrawler:
    """Drive a Chromium page through login and company search on Aiqicha.

    ``browser``, ``context`` and ``page`` are populated by ``run``; every
    other method expects them to be set already.
    """

    def __init__(self):
        self.browser = None
        self.context = None
        self.page = None

    def _delete_webdriver_flag(self) -> None:
        """Delete ``navigator.webdriver`` from the prototype on the current page."""
        self.page.evaluate(_DELETE_WEBDRIVER_JS)

    def anti_detection(self) -> None:
        """Register the anti-detection init script on ``self.page``.

        The script overrides navigator, screen and window properties and
        silences ``console.log`` / ``console.debug`` inside the page.
        """
        self.page.add_init_script(_ANTI_DETECTION_JS)

    def random_behavior(self) -> None:
        """Pause, move the mouse a few times and optionally scroll the page."""
        # Random initial wait.
        time.sleep(random.uniform(2, 5))

        # Random mouse movements with a short pause after each one.
        for _ in range(random.randint(3, 7)):
            self.page.mouse.move(
                random.randint(100, 1200),
                random.randint(100, 600),
            )
            time.sleep(random.uniform(0.1, 0.8))

        # Scroll down by a random distance on roughly half of the calls.
        if random.choice([True, False]):
            scroll_distance = random.randint(200, 800)
            self.page.mouse.wheel(0, scroll_distance)
            time.sleep(random.uniform(1, 2))

    def init_cookie_file(self) -> None:
        """Create ``COOKIE_PATH`` holding an empty JSON list if it is missing."""
        if not os.path.exists(COOKIE_PATH):
            with open(COOKIE_PATH, 'w', encoding="utf-8") as f:
                json.dump([], f)

    def save_cookies(self) -> None:
        """Write the cookies of the current browser context to ``COOKIE_PATH``."""
        cookies = self.context.cookies()
        with open(COOKIE_PATH, 'w', encoding="utf-8") as f:
            json.dump(cookies, f, indent=2)

    def load_cookies(self) -> bool:
        """Load cookies from ``COOKIE_PATH`` into the browser context.

        Returns:
            True when a non-empty cookie list was added to the context;
            False when the list is empty or loading failed for any reason.
        """
        try:
            with open(COOKIE_PATH, 'r', encoding="utf-8") as f:
                cookies = json.load(f)
            if cookies:
                self.context.add_cookies(cookies)
                return True
            return False
        except Exception:
            # Best effort: an unreadable, malformed or rejected cookie file
            # means "no usable session". ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt / SystemExit propagate.
            return False

    def bypass_debugger(self) -> None:
        """Register an init script that stubs ``Function.prototype.constructor``.

        The script also silences ``console.debug`` and reports
        ``navigator.webdriver`` as false. Nothing in this file calls this
        method; it is kept as an optional hook.
        """
        self.page.add_init_script(_BYPASS_DEBUGGER_JS)

    def check_login_status(self) -> bool:
        """Report whether the current page shows a logged-in session.

        Returns:
            True when a logged-in marker element is visible. False when it is
            not, including the case where detection itself failed and the
            fallback wait for ``.user-center`` timed out.
        """
        try:
            # Close the "new feature" popup first; it may cover the header.
            self.close_feature_popup()

            # Wait until the page has finished loading.
            self.page.wait_for_load_state("networkidle")

            # A visible user-center element means we are logged in.
            for element in self.page.query_selector_all(LOGGED_IN_SELECTOR):
                if element and element.is_visible():
                    print("检测到已登录状态")
                    return True

            # A visible login button means we are not logged in.
            login_element = self.page.query_selector('.login')
            if login_element and login_element.is_visible():
                print("检测到未登录状态")

            # Neither marker visible: treat as not logged in. The original
            # implicitly returned None here; an explicit False keeps the
            # documented bool contract.
            return False
        except Exception:
            # Detection failed (e.g. the networkidle wait timed out); give
            # the user-center element a short chance to appear.
            try:
                self.page.wait_for_selector('.user-center', timeout=3000)
                print("检测到已登录状态")
                return True
            except Exception:
                print("登录状态检测异常")
                return False

    def close_feature_popup(self) -> None:
        """Click the first visible close icon of the "new feature" popup, if any."""
        try:
            close_buttons = self.page.query_selector_all('.close-icon.ivu-icon-ios-close')
            for close_button in close_buttons:
                if close_button.is_visible():
                    close_button.click()
                    print("已关闭功能上新弹窗")
                    # Give the popup time to disappear.
                    time.sleep(1)
                    break
        except Exception:
            # Deliberately best effort: a missing or stale popup must never
            # abort the caller's flow.
            pass

    def login(self, login_timeout_ms=10000) -> None:
        """Open the home page and perform a QR-code login when needed.

        Args:
            login_timeout_ms: How long to wait, in milliseconds, for a
                logged-in marker to appear after the login button is clicked.
                The default preserves the original hard-coded 10 seconds.
                NOTE(review): that may be short for a manual QR scan —
                consider passing a larger value.

        Raises:
            Playwright's timeout error when the login button or the
            logged-in marker does not appear in time.
        """
        self.page.goto(BASE_URL)

        # Re-apply the webdriver removal on the loaded page.
        self._delete_webdriver_flag()

        # Wait until the page has finished loading.
        self.page.wait_for_load_state("networkidle")

        # Close the "new feature" popup if it is showing.
        self.close_feature_popup()

        if self.check_login_status():
            return

        print("开始执行登录流程...")
        # Open the login dialog.
        login_btn = self.page.wait_for_selector('.login', timeout=20000)
        login_btn.click()

        print("请扫描页面二维码登录...")
        time.sleep(3)  # Give the QR code time to render completely.

        # Login is considered complete once a user-center element shows up.
        self.page.wait_for_selector(LOGGED_IN_SELECTOR, timeout=login_timeout_ms)
        self.save_cookies()
        print("登录成功!")

    @staticmethod
    def _field_text(card, selector) -> str:
        """Return the inner text of ``selector`` inside ``card``, or "" if absent."""
        element = card.query_selector(selector)
        return element.inner_text() if element is not None else ""

    def search_company(self, company_name) -> dict:
        """Search for ``company_name`` and return data from the first result.

        Args:
            company_name: Search keyword; it is percent-encoded before being
                placed in the query string.

        Returns:
            A dict with the keys ``name``, ``legal_person``, ``reg_capital``
            and ``status``. A field whose element is missing from the result
            card is returned as an empty string.

        Raises:
            RuntimeError: When the result card is gone by the time it is read.
            Playwright's timeout error when no result appears within 10 s.
        """
        # Encode the keyword so characters such as '&', '#' or spaces cannot
        # break or truncate the query string.
        self.page.goto(f"{BASE_URL}/s?q={quote(company_name, safe='')}")

        # Re-apply the webdriver removal on the loaded page.
        self._delete_webdriver_flag()

        # Close the "new feature" popup if it is showing.
        self.close_feature_popup()

        self.page.wait_for_selector(".search-item", timeout=10000)

        # Extract the basic company information from the first result card.
        company_card = self.page.query_selector(".search-item")
        if company_card is None:
            raise RuntimeError(f"search result card disappeared for {company_name!r}")

        return {
            "name": self._field_text(company_card, ".company-name"),
            "legal_person": self._field_text(company_card, ".legal-person"),
            "reg_capital": self._field_text(company_card, ".reg-capital"),
            "status": self._field_text(company_card, ".company-status"),
        }

    def _ensure_logged_in(self) -> None:
        """Restore the stored session, falling back to an interactive login."""
        if not self.load_cookies():
            print("未找到有效Cookie,开始登录流程...")
            self.login()
            return

        print("已加载Cookie,验证登录状态...")
        # Visit the site to verify that the stored cookies still work.
        self.page.goto(BASE_URL)
        self.page.wait_for_load_state("networkidle")
        if not self.check_login_status():
            print("Cookie已过期或无效,重新登录...")
            self.login()
        else:
            print("Cookie有效,已登录")

    def run(self, companies) -> None:
        """Launch Chromium, make sure a session exists and query each company.

        Args:
            companies: Iterable of company names to search for. A failure on
                one company is printed and does not stop the remaining ones.
        """
        self.init_cookie_file()
        with sync_playwright() as p:
            # NOTE(review): --no-sandbox and --disable-web-security weaken
            # browser security; they are kept because the original launch
            # configuration used them.
            self.browser = p.chromium.launch(
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--disable-infobars",
                    "--disable-extensions",
                    "--disable-plugins",
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-web-security",
                    "--disable-features=IsolateOrigins,site-per-process",
                    "--disable-background-timer-throttling",
                    "--disable-backgrounding-occluded-windows",
                    "--disable-renderer-backgrounding",
                    "--disable-ipc-flooding-protection",
                ],
            )
            self.context = None
            try:
                self.context = self.browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
                    viewport={"width": 1366, "height": 768},
                    device_scale_factor=1,
                    is_mobile=False,
                    has_touch=False,
                    locale="zh-CN",
                    timezone_id="Asia/Shanghai",
                )
                self.page = self.context.new_page()
                self.anti_detection()

                # Apply the webdriver removal once right away as well.
                self._delete_webdriver_flag()

                self.random_behavior()
                self._ensure_logged_in()

                for company in companies:
                    try:
                        data = self.search_company(company)
                        print(f"{data['name']} | 法人:{data['legal_person']} | 注册资本:{data['reg_capital']}")
                        self.save_cookies()  # Refresh stored cookies after each query.
                        time.sleep(3)  # Throttle requests.
                    except Exception as exc:
                        print(f"查询 {company} 失败: {exc}")
            finally:
                # Release the browser even when login or setup raised.
                if self.context is not None:
                    self.context.close()
                self.browser.close()


if __name__ == "__main__":
    crawler = AiqichaCrawler()
    companies = ["阿里巴巴", "腾讯科技", "华为技术"]
    crawler.run(companies)