Files
SearchCompany/tool/aiqicha_detail.py
manchuwork de3c97e828 aiqicha
2025-11-03 18:57:58 +08:00

209 lines
7.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# file: tool/aiqicha_detail.py
import time
import json
from tool.web_browser import WebBrowser
from tool.aiqicha_login import AiqichaLoginManager # 导入登录管理器
from tool.aiqicha_detail_parser import AiqichaDetailParser # 导入解析器
class AiqichaDetailCrawler:
    """Crawl a single Aiqicha company-detail page through a shared browser.

    The crawler owns a ``WebBrowser`` (created with the given cookie file) and
    an ``AiqichaLoginManager`` that is created when the browser starts.  It can
    be used directly (``start_browser`` / ``close_browser``) or as a context
    manager, which starts the browser on entry and closes it on exit.
    """

    # CSS selectors for the close buttons of the two popup flavours.
    _DIALOG_CLOSE_SELECTOR = 'div.dialog-close'
    _MODAL_CLOSE_SELECTOR = 'a.ivu-modal-close'

    # Timeout (milliseconds) passed to ``wait_for_selector``.
    _ELEMENT_TIMEOUT_MS = 10000

    def __init__(self, cookie_path="cookies/aiqicha_cookies.json"):
        """Create the crawler without launching the browser.

        Args:
            cookie_path: Path handed to ``WebBrowser``; used for loading and
                saving cookies.
        """
        self.browser = WebBrowser(cookie_path)
        self.browser_started = False
        # Created in start_browser() once the browser is running.
        self.login_manager = None

    def start_browser(self) -> None:
        """Launch the browser, create the login manager and load cookies.

        Does nothing when the browser is already started.  Any failure is
        printed rather than raised; ``self.browser_started`` tells the caller
        whether the browser is usable afterwards.
        """
        if self.browser_started:
            return

        launched = False
        try:
            self.browser.start_browser()
            launched = True
            self.login_manager = AiqichaLoginManager(self.browser)
            if self.browser.load_cookies():
                print("已加载Cookie")
            else:
                print("未找到有效Cookie")
            # Login state is not checked here; crawl_company_detail() checks
            # it on every crawl.
            self.browser_started = True
        except Exception as e:
            print(f"启动浏览器失败: {e}")
            self.browser_started = False
            if launched:
                # The browser did start but a later step failed.  Because
                # close_browser() is a no-op while browser_started is False,
                # release the browser here so it is not leaked.
                try:
                    self.browser.close_browser()
                except Exception as close_err:
                    print(f"关闭浏览器时出错: {close_err}")

    def close_browser(self) -> None:
        """Save cookies and close the browser if it was started.

        Saving cookies and closing the browser are attempted independently so
        that a cookie-save failure cannot leave the browser running.  Errors
        are printed, never raised.
        """
        if not self.browser_started:
            return

        try:
            self.browser.save_cookies()
        except Exception as e:
            print(f"关闭浏览器时出错: {e}")

        try:
            self.browser.close_browser()
        except Exception as e:
            print(f"关闭浏览器时出错: {e}")
        finally:
            self.browser_started = False

    def _click_visible_buttons(self, selector, button_label, done_label,
                               error_label) -> None:
        """Click every visible element matching ``selector``; print errors."""
        try:
            buttons = self.browser.page.query_selector_all(selector)
            print(f'找到 {len(buttons)} 个{button_label}')
            for position, button in enumerate(buttons, start=1):
                if button and button.is_visible():
                    print(f"点击第 {position} 个可见的{button_label}")
                    button.click()
                    time.sleep(1)  # give the popup time to close
            print(f"已完成关闭可见的{done_label}")
        except Exception as e:
            print(f"{error_label}时出错: {e}")

    def close_svip_popups(self, include_modals: bool = False) -> None:
        """Close the SVIP popups shown on the current page.

        Args:
            include_modals: When True, also click visible ``a.ivu-modal-close``
                buttons.  Defaults to False because the original code
                short-circuited before that pass, so only ``div.dialog-close``
                buttons are clicked unless the caller opts in.
        """
        self._click_visible_buttons(
            self._DIALOG_CLOSE_SELECTOR, "对话框关闭按钮", "对话框", "关闭对话框")
        if include_modals:
            self._click_visible_buttons(
                self._MODAL_CLOSE_SELECTOR, "关闭按钮", "模态框", "关闭弹窗")

    def _settle_detail_page(self) -> None:
        """Dismiss popups, check login and wait for the detail page content.

        Best effort: any error is printed and swallowed so that parsing can
        still be attempted on whatever has loaded.  HTML snapshots are saved
        after each step.
        """
        try:
            self.close_svip_popups()
            self.browser.save_page_html(
                "demo/html/aiqicha-datail-after-close_svip_popups.html")

            if self.login_manager.check_and_login():
                print("crawl_company_detail登录成功")
            else:
                print("crawl_company_detail登录失败")
            self.browser.save_page_html(
                "demo/html/aiqicha-datail-afterchecklogin.html")

            # Close popups again after the login check.
            self.close_svip_popups()
            self.browser.save_page_html(
                "demo/html/aiqicha-datail-after-close_svip_popups2.html")

            print("等待页面关键元素加载...")
            try:
                self.browser.page.wait_for_selector(
                    '.addr-enter-bg-ele', timeout=self._ELEMENT_TIMEOUT_MS)
                print("crawl_company_detail.addr-enter-bg-ele success")
                # Reported only when the wait actually succeeded.
                print("关键元素已加载")
            except Exception as e:
                print(f"等待页面元素时出错: {e}")

            # Extra fixed delay before parsing.
            time.sleep(3)
            print("额外等待完成,页面应该已完全加载")
        except Exception as e:
            print(f"等待页面元素时出错: {e}")
            print("继续尝试解析页面内容...")

    def crawl_company_detail(self, url: str, refer_url: str = None) -> dict:
        """Crawl one Aiqicha company-detail page.

        Starts the browser on demand, visits ``url``, dismisses popups, checks
        login, waits for the page content and hands the browser to
        ``AiqichaDetailParser``.

        Args:
            url: Company detail page URL, e.g.
                https://aiqicha.baidu.com/company_detail_45719927199916
            refer_url: Optional value for the ``Referer`` request header.

        Returns:
            The value produced by ``AiqichaDetailParser.parse_company_info()``,
            or an empty dict when the browser cannot start, the page cannot be
            visited, the parser returns nothing, or any error occurs.
        """
        if not self.browser_started:
            self.start_browser()
        if not self.browser_started:
            print("浏览器未启动,无法执行爬取")
            return {}

        print(f'正在爬取企业详情: {url}')
        try:
            if refer_url:
                # NOTE(review): if page is a Playwright Page, extra headers
                # stay in effect for later requests too, so a Referer set here
                # may persist into subsequent crawls - confirm.
                self.browser.page.set_extra_http_headers({"Referer": refer_url})

            if not self.browser.visit_page(url):
                print("访问页面失败")
                return {}

            self.close_svip_popups()
            try:
                self.browser.page.wait_for_selector(
                    '.header-user-center', timeout=self._ELEMENT_TIMEOUT_MS)
                print(".header-user-center1: 等待页面元素ok")
            except Exception as e:
                print(f".header-user-center1: 等待页面元素时出错: {e}")
            self.browser.save_page_html("demo/html/aiqicha-datail.html")

            self._settle_detail_page()
            self.browser.save_cookies()

            print("开始解析页面信息...")
            parser = AiqichaDetailParser(self.browser)
            company_info = parser.parse_company_info() or {}
            # .get keeps a result that lacks 'name' from being discarded by
            # the outer except just because of this log line.
            print(f"成功爬取企业信息: {company_info.get('name')}")
            return company_info
        except Exception as e:
            print(f"爬取过程中出现错误: {e}")
            return {}

    def __enter__(self):
        """Start the browser and return the crawler."""
        self.start_browser()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the browser; exceptions from the ``with`` body propagate."""
        self.close_browser()
# Usage examples:
# Option 1: manage the browser lifecycle manually
# crawler = AiqichaDetailCrawler()
# crawler.start_browser()
# detail = crawler.crawl_company_detail("https://aiqicha.baidu.com/company_detail_45719927199916")
# crawler.close_browser()
# Option 2: use the context manager
# with AiqichaDetailCrawler() as crawler:
#     detail = crawler.crawl_company_detail("https://aiqicha.baidu.com/company_detail_45719927199916")
#     print(detail)