This commit is contained in:
manchuwork
2025-10-04 01:19:56 +08:00
parent 6d42506d4e
commit ce414ffc51
5 changed files with 11969 additions and 11 deletions

1
cookies/.gitignore vendored
View File

@@ -1 +1,2 @@
* *
*.json

View File

@@ -85,6 +85,10 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
aiqicha_data = filtered_aiqicha_data aiqicha_data = filtered_aiqicha_data
print(f'过滤后剩余 {len(aiqicha_data)} 条爱企查数据待处理') print(f'过滤后剩余 {len(aiqicha_data)} 条爱企查数据待处理')
if len(aiqicha_data) <= 0:
print("没有待处理的爱企查数据,退出")
return
# 使用爱企查详情爬虫 # 使用爱企查详情爬虫
with AiqichaDetailCrawler() as crawler: with AiqichaDetailCrawler() as crawler:
company_details = [] company_details = []
@@ -143,7 +147,7 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
# 添加延迟,避免请求过快 # 添加延迟,避免请求过快
time.sleep(2) time.sleep(2)
next_sleep_interval = random.uniform(5, 15) next_sleep_interval = random.uniform(3, 15)
time.sleep(next_sleep_interval) time.sleep(next_sleep_interval)
print(f"总共成功处理并保存了 {success_count} 条企业详情数据到 {output_csv}") print(f"总共成功处理并保存了 {success_count} 条企业详情数据到 {output_csv}")

1
data/.gitignore vendored
View File

@@ -1 +1,2 @@
* *
data.csv

11925
demo/aiqicha-datail.html Normal file

File diff suppressed because one or more lines are too long

View File

@@ -27,12 +27,12 @@ class AiqichaDetailCrawler:
else: else:
print("已加载Cookie") print("已加载Cookie")
# 使用登录管理器检测登录状态 # # 使用登录管理器检测登录状态
logined = self.login_manager.check_and_login() # logined = self.login_manager.check_and_login()
if logined: # if logined:
print("登录成功") # print("登录成功")
else: # else:
print("登录失败") # print("登录失败")
self.browser_started = True self.browser_started = True
except Exception as e: except Exception as e:
print(f"启动浏览器失败: {e}") print(f"启动浏览器失败: {e}")
@@ -80,12 +80,39 @@ class AiqichaDetailCrawler:
print("等待页面关键元素加载...") print("等待页面关键元素加载...")
try: try:
# 等待关键元素加载,增加超时时间 # 等待关键元素加载,增加超时时间
self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=15000) try:
self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=10000)
except Exception as e:
print(f"等待页面元素时出错: {e}")
# self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=10000)
print("关键元素已加载") print("关键元素已加载")
# 使用登录管理器检测登录状态
logined = self.login_manager.check_and_login()
if logined:
print("登录成功")
else:
print("登录失败")
# 关闭指定的弹窗 (aiqicha-datail.html 792-793 行对应的元素)
try:
# 查找并点击关闭按钮
svip_modals = self.browser.page.query_selector_all('.clue-card-wrap')
for modal in svip_modals:
if modal:
# 查找.ivu-modal-close关闭按钮
close_button = modal.query_selector('a.ivu-modal-close')
if close_button:
close_button.click()
print("已关闭SVIP弹窗")
break
except Exception as e:
print(f"关闭弹窗时出错: {e}")
# 额外等待一段时间确保页面完全加载 # 额外等待一段时间确保页面完全加载
import time import time
time.sleep(10) time.sleep(3)
print("额外等待完成,页面应该已完全加载") print("额外等待完成,页面应该已完全加载")
except Exception as e: except Exception as e:
print(f"等待页面元素时出错: {e}") print(f"等待页面元素时出错: {e}")