# file: tool/aiqicha_detail.py
"""Crawler for Aiqicha (aiqicha.baidu.com) company detail pages."""
import time
import json

from tool.web_browser import WebBrowser
from tool.aiqicha_login import AiqichaLoginManager  # login manager
from tool.aiqicha_detail_parser import AiqichaDetailParser  # detail-page parser


class AiqichaDetailCrawler:
    """Drive a ``WebBrowser`` to open and parse one company detail page.

    The instance can be used as a context manager: the browser is started on
    entry and closed (after saving cookies) on exit.
    """

    def __init__(self, cookie_path="cookies/aiqicha_cookies.json"):
        """Create the crawler without launching the browser.

        Args:
            cookie_path: Path handed to ``WebBrowser``; presumably the cookie
                file it loads from and saves to (confirm in ``WebBrowser``).
        """
        self.browser = WebBrowser(cookie_path)
        self.browser_started = False
        # Created in start_browser(), once the browser itself exists.
        self.login_manager = None

    def start_browser(self):
        """Start the browser, create the login manager and load cookies.

        Does nothing when the browser is already started. Failures are printed
        instead of raised; inspect ``browser_started`` for the outcome.
        """
        if self.browser_started:
            return
        try:
            self.browser.start_browser()
            self.login_manager = AiqichaLoginManager(self.browser)
            if self.browser.load_cookies():
                print("已加载Cookie")
            else:
                print("未找到有效Cookie")
            # No login check here: crawl_company_detail() runs it after the
            # detail page has been opened.
            self.browser_started = True
        except Exception as e:
            print(f"启动浏览器失败: {e}")
            self.browser_started = False

    def close_browser(self):
        """Save cookies and close the browser if it was started.

        Saving cookies and closing are attempted independently so that a
        failure while saving cookies can no longer leave the browser open.
        Errors are printed, never raised; ``browser_started`` is always reset.
        """
        if not self.browser_started:
            return
        try:
            self.browser.save_cookies()
        except Exception as e:
            print(f"保存Cookie时出错: {e}")
        try:
            self.browser.close_browser()
        except Exception as e:
            print(f"关闭浏览器时出错: {e}")
        finally:
            self.browser_started = False

    def close_svip_popups(self):
        """Click every visible ``div.dialog-close`` button on the current page.

        Best effort: any error is printed and swallowed, which also ends the
        pass over the remaining buttons.
        """
        # NOTE: a second pass over 'a.ivu-modal-close' buttons used to follow
        # this one but sat behind an unconditional return and never ran; it
        # has been removed.
        try:
            close_buttons = self.browser.page.query_selector_all('div.dialog-close')
            print(f'找到 {len(close_buttons)} 个对话框关闭按钮')
            for position, button in enumerate(close_buttons, start=1):
                if not button or not button.is_visible():
                    continue
                print(f"点击第 {position} 个可见的对话框关闭按钮")
                button.click()
                time.sleep(1)  # give the dialog time to close
            print("已完成关闭可见的对话框")
        except Exception as e:
            print(f"关闭对话框时出错: {e}")

    def _prepare_page_for_parsing(self):
        """Close popups, run the login check and wait for the page to settle.

        Best effort: any failure is printed and swallowed so that the caller
        still attempts to parse whatever has been loaded. HTML snapshots are
        written after each step.
        """
        try:
            self.close_svip_popups()
            self.browser.save_page_html("demo/html/aiqicha-datail-after-close_svip_popups.html")

            if self.login_manager.check_and_login():
                print("crawl_company_detail:登录成功")
            else:
                print("crawl_company_detail:登录失败")
            self.browser.save_page_html("demo/html/aiqicha-datail-afterchecklogin.html")

            # Popups are closed once more after the login check (presumably
            # the check can bring them back -- confirm).
            self.close_svip_popups()
            self.browser.save_page_html("demo/html/aiqicha-datail-after-close_svip_popups2.html")

            print("等待页面关键元素加载...")
            try:
                self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=10000)
            except Exception as e:
                print(f"等待页面元素时出错: {e}")
            else:
                # Only report the element as loaded when the wait succeeded.
                print("crawl_company_detail:.addr-enter-bg-ele success")
                print("关键元素已加载")

            # Extra fixed delay so late content has a chance to render.
            time.sleep(3)
            print("额外等待完成,页面应该已完全加载")
        except Exception as e:
            print(f"等待页面元素时出错: {e}")
            print("继续尝试解析页面内容...")

    def crawl_company_detail(self, url: str, refer_url: str = None):
        """Crawl one Aiqicha company detail page.

        Starts the browser on demand, opens ``url``, closes popups, runs the
        login check, saves cookies and parses the page with
        ``AiqichaDetailParser``.

        Args:
            url: Detail page URL, e.g.
                https://aiqicha.baidu.com/company_detail_45719927199916
            refer_url: Optional value for the ``Referer`` header, used to make
                the visit look like a click from a search results page.

        Returns:
            dict: The parsed company information, or ``{}`` when the browser
            could not be started, the page could not be visited, nothing was
            parsed, or an error occurred. Errors are printed, not raised.
        """
        if not self.browser_started:
            self.start_browser()
        if not self.browser_started:
            print("浏览器未启动,无法执行爬取")
            return {}

        print(f'正在爬取企业详情: {url}')
        try:
            if refer_url:
                # NOTE(review): if page is a Playwright Page, these headers
                # stay in effect for later requests too -- confirm intended.
                self.browser.page.set_extra_http_headers({"Referer": refer_url})

            if not self.browser.visit_page(url):
                print("访问页面失败")
                return {}

            self.close_svip_popups()
            try:
                self.browser.page.wait_for_selector('.header-user-center', timeout=10000)
                print(".header-user-center1: 等待页面元素ok")
            except Exception as e:
                print(f".header-user-center1: 等待页面元素时出错: {e}")
            # Snapshot of the freshly loaded page.
            self.browser.save_page_html("demo/html/aiqicha-datail.html")

            self._prepare_page_for_parsing()
            self.browser.save_cookies()

            print("开始解析页面信息...")
            parser = AiqichaDetailParser(self.browser)
            company_info = parser.parse_company_info() or {}
            if not company_info:
                print("未解析到企业信息")
                return {}
            # .get() keeps a missing 'name' key from discarding parsed data.
            print(f"成功爬取企业信息: {company_info.get('name')}")
            return company_info
        except Exception as e:
            print(f"爬取过程中出现错误: {e}")
            return {}

    def __enter__(self):
        """Context-manager entry: start the browser and return the crawler."""
        self.start_browser()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context-manager exit: close the browser; exceptions propagate."""
        self.close_browser()


# Usage examples:
#
# Option 1: manage the browser lifecycle manually
# crawler = AiqichaDetailCrawler()
# crawler.start_browser()
# detail = crawler.crawl_company_detail("https://aiqicha.baidu.com/company_detail_45719927199916")
# crawler.close_browser()
#
# Option 2: use the context manager
# with AiqichaDetailCrawler() as crawler:
#     detail = crawler.crawl_company_detail("https://aiqicha.baidu.com/company_detail_45719927199916")
#     print(detail)