# file: tool/aiqicha_detail.py  (209 lines, 7.8 KiB, Python)
|
||
|
||
import time
|
||
import json
|
||
from tool.web_browser import WebBrowser
|
||
from tool.aiqicha_login import AiqichaLoginManager # 导入登录管理器
|
||
from tool.aiqicha_detail_parser import AiqichaDetailParser # 导入解析器
|
||
|
||
class AiqichaDetailCrawler:
    """Crawl Aiqicha company detail pages through a ``WebBrowser`` wrapper.

    The crawler owns one ``WebBrowser`` instance, starts it lazily, loads and
    saves cookies around the session, and delegates login checks to
    ``AiqichaLoginManager`` and page parsing to ``AiqichaDetailParser``.
    It can be used directly or as a context manager.
    """

    # Timeout (milliseconds) passed to ``page.wait_for_selector``.
    ELEMENT_TIMEOUT_MS = 10000
    # Extra pause (seconds) after the key element wait, before parsing.
    SETTLE_SECONDS = 3
    # Pause (seconds) after clicking a popup close button.
    POPUP_CLOSE_PAUSE_SECONDS = 1

    def __init__(self, cookie_path="cookies/aiqicha_cookies.json"):
        """Create the browser wrapper; the browser itself is not started yet.

        Args:
            cookie_path: Path handed to ``WebBrowser`` for cookie storage.
        """
        self.browser = WebBrowser(cookie_path)
        self.browser_started = False
        # Created in start_browser(), once the browser is running.
        self.login_manager = None

    def start_browser(self):
        """Start the browser, create the login manager and load cookies.

        Does nothing when the browser is already marked as started. Any
        exception is reported on stdout and leaves ``browser_started`` False;
        it is not re-raised.
        """
        if self.browser_started:
            return

        launched = False
        try:
            self.browser.start_browser()
            launched = True

            self.login_manager = AiqichaLoginManager(self.browser)

            if not self.browser.load_cookies():
                print("未找到有效Cookie")
            else:
                print("已加载Cookie")

            # The login state is not checked here; crawl_company_detail()
            # calls login_manager.check_and_login() once a page is open.
            self.browser_started = True
        except Exception as e:
            print(f"启动浏览器失败: {e}")
            self.browser_started = False
            if launched:
                # The browser did start but a later step failed: close it so
                # it is not left running while we report "not started".
                try:
                    self.browser.close_browser()
                except Exception as close_err:
                    print(f"关闭浏览器时出错: {close_err}")

    def close_browser(self):
        """Save cookies and close the browser if it was started.

        Saving cookies and closing the browser are attempted independently,
        so a failure to save cookies does not leave the browser open.
        ``browser_started`` is always reset to False afterwards.
        """
        if not self.browser_started:
            return

        try:
            self.browser.save_cookies()
        except Exception as e:
            print(f"保存Cookie时出错: {e}")

        try:
            self.browser.close_browser()
        except Exception as e:
            print(f"关闭浏览器时出错: {e}")
        finally:
            self.browser_started = False

    def close_svip_popups(self, selector="div.dialog-close"):
        """Click every visible popup close button on the current page.

        Args:
            selector: CSS selector of the close buttons. Defaults to the
                dialog close button (``div.dialog-close``). Another selector
                such as ``a.ivu-modal-close`` may be passed to close other
                modal types.

        Errors are printed and swallowed: closing popups is best-effort.
        """
        try:
            close_buttons = self.browser.page.query_selector_all(selector)
            print(f'找到 {len(close_buttons)} 个对话框关闭按钮')

            for i, button in enumerate(close_buttons):
                if button and button.is_visible():
                    print(f"点击第 {i + 1} 个可见的对话框关闭按钮")
                    button.click()
                    # Give the dialog time to disappear before the next click.
                    time.sleep(self.POPUP_CLOSE_PAUSE_SECONDS)

            print("已完成关闭可见的对话框")
        except Exception as e:
            print(f"关闭对话框时出错: {e}")

    def _save_debug_html(self, path):
        """Best-effort dump of the current page HTML to *path* for debugging."""
        try:
            self.browser.save_page_html(path)
        except Exception as e:
            print(f"保存页面HTML时出错: {e}")

    def _prepare_detail_page(self):
        """Dismiss popups, check login and wait for the detail page to settle.

        Every failure in here is reported and swallowed so that parsing is
        still attempted on whatever the page currently shows.
        """
        try:
            self.close_svip_popups()
            self._save_debug_html("demo/html/aiqicha-datail-after-close_svip_popups.html")

            login = self.login_manager.check_and_login()
            if login:
                print("crawl_company_detail:登录成功")
            else:
                print("crawl_company_detail:登录失败")

            self._save_debug_html("demo/html/aiqicha-datail-afterchecklogin.html")
            # The login flow may have opened new popups; close them again.
            self.close_svip_popups()
            self._save_debug_html("demo/html/aiqicha-datail-after-close_svip_popups2.html")

            print("等待页面关键元素加载...")
            try:
                self.browser.page.wait_for_selector(
                    '.addr-enter-bg-ele', timeout=self.ELEMENT_TIMEOUT_MS
                )
                print("crawl_company_detail:.addr-enter-bg-ele success")
                print("关键元素已加载")
            except Exception as e:
                print(f"等待页面元素时出错: {e}")

            # Extra pause so late content has a chance to render.
            time.sleep(self.SETTLE_SECONDS)
            print("额外等待完成，页面应该已完全加载")
        except Exception as e:
            print(f"等待页面元素时出错: {e}")
            print("继续尝试解析页面内容...")

    def crawl_company_detail(self, url: str, refer_url: str = None):
        """Crawl one Aiqicha company detail page.

        Args:
            url: Detail page URL, e.g.
                https://aiqicha.baidu.com/company_detail_45719927199916
            refer_url: Optional value for the ``Referer`` request header, used
                to make the visit look like a click from a search result.

        Returns:
            dict: The value returned by ``AiqichaDetailParser.parse_company_info``,
            or an empty dict when the browser could not be started, the page
            could not be visited, nothing was parsed, or an error occurred.
        """
        if not self.browser_started:
            self.start_browser()

        if not self.browser_started:
            print("浏览器未启动，无法执行爬取")
            return {}

        print(f'正在爬取企业详情: {url}')

        try:
            if refer_url:
                # NOTE(review): if `page` is a Playwright Page, extra headers
                # stay in effect for later requests on the same page, so a
                # later call without refer_url may still send this Referer.
                # Confirm whether that matters for callers.
                self.browser.page.set_extra_http_headers({"Referer": refer_url})

            if not self.browser.visit_page(url):
                print("访问页面失败")
                return {}

            self.close_svip_popups()

            try:
                self.browser.page.wait_for_selector(
                    '.header-user-center', timeout=self.ELEMENT_TIMEOUT_MS
                )
                print(".header-user-center1: 等待页面元素ok")
            except Exception as e:
                print(f".header-user-center1: 等待页面元素时出错: {e}")

            self._save_debug_html("demo/html/aiqicha-datail.html")
            self._prepare_detail_page()

            # Persisting cookies is best-effort; a failure here must not
            # throw away a page that is ready to be parsed.
            try:
                self.browser.save_cookies()
            except Exception as e:
                print(f"保存Cookie时出错: {e}")

            print("开始解析页面信息...")
            parser = AiqichaDetailParser(self.browser)
            company_info = parser.parse_company_info()
            if not company_info:
                print("未解析到企业信息")
                return {}

            # .get(): a result without a name must not discard the parsed data.
            print(f"成功爬取企业信息: {company_info.get('name', '')}")
            return company_info
        except Exception as e:
            print(f"爬取过程中出现错误: {e}")
            return {}

    def __enter__(self):
        """Context manager entry: start the browser and return the crawler."""
        self.start_browser()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: close the browser; exceptions propagate."""
        self.close_browser()
|
||
|
||
|
||
# Usage examples:
# Option 1: manage the browser lifecycle manually
# crawler = AiqichaDetailCrawler()
# crawler.start_browser()
# detail = crawler.crawl_company_detail("https://aiqicha.baidu.com/company_detail_45719927199916")
# crawler.close_browser()

# Option 2: use the context manager
# with AiqichaDetailCrawler() as crawler:
#     detail = crawler.crawl_company_detail("https://aiqicha.baidu.com/company_detail_45719927199916")
#     print(detail)