manchuwork
2025-09-25 14:47:19 +08:00
parent 54f3beded9
commit 6444fecd4e
85 changed files with 4892 additions and 63 deletions


@@ -49,7 +49,30 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
        csv_file_name=output_csv,
        headers=output_headers
    )

    # Read the company names already present in the output CSV, for deduplication
    existing_company_names = set()
    try:
        existing_data = output_csv_tool.get_all_data()
        existing_company_names = {item['company_name'] for item in existing_data if item['company_name']}
        print(f"Found {len(existing_company_names)} existing company records")
    except FileNotFoundError:
        print(f"Output file {output_csv} does not exist; a new file will be created")
    except Exception as e:
        print(f"Error while reading existing data: {e}")

    # Filter out companies that already exist in the output CSV
    filtered_aiqicha_data = []
    for item in aiqicha_data:
        company_name = item.get('company_name', '')
        if company_name and company_name in existing_company_names:
            print(f"Skipping existing company: {company_name}")
        else:
            filtered_aiqicha_data.append(item)
    aiqicha_data = filtered_aiqicha_data
    print(f'{len(aiqicha_data)} Aiqicha records left to process after filtering')

    # Use the Aiqicha detail crawler
    with AiqichaDetailCrawler() as crawler:
        company_details = []
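The deduplication added above can also be read as two small, independently testable steps: load the names already saved, then drop any new item whose name is in that set. A minimal sketch of the same pattern using the standard csv module instead of the project's CsvTool (load_seen_names and filter_new_items are hypothetical helper names, not part of this repository):

import csv

def load_seen_names(csv_path, name_field='company_name'):
    # Collect the non-empty company names already present in the output CSV.
    seen = set()
    try:
        with open(csv_path, newline='', encoding='utf-8') as f:
            for row in csv.DictReader(f):
                if row.get(name_field):
                    seen.add(row[name_field])
    except FileNotFoundError:
        pass  # No output file yet, so there is nothing to skip.
    return seen

def filter_new_items(items, seen, name_field='company_name'):
    # Keep only items whose company name has not been saved before;
    # items with an empty name are kept, matching the logic in the diff.
    return [item for item in items if item.get(name_field, '') not in seen]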
@@ -101,10 +124,11 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
            time.sleep(next_sleep_interval)

    print(f"Successfully processed and saved {success_count} company detail records to {output_csv}")
    # crawler.browser.close_browser()


if __name__ == '__main__':
    # Read the Aiqicha URLs from the original search-result CSV, crawl the details, and save them to a new CSV file
    crawl_and_save_aiqicha_details('company_search_bing_data.csv', 'aiqicha_company_details.csv')
    crawl_and_save_aiqicha_details('data/company_search_bing_data.csv', 'data/aiqicha_company_details.csv')

    # Original code kept for reference
    # all_data = query_init_company_data('company_search_bing_data.csv')
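Because both CSV paths now live under data/, the entry point assumes that directory exists. A minimal guard before the call avoids a failure on a fresh checkout (a sketch; whether CsvTool creates missing parent directories itself is not shown in this diff):

import os

# Create the data/ directory if it is missing before reading or writing the CSVs.
os.makedirs('data', exist_ok=True)
crawl_and_save_aiqicha_details('data/company_search_bing_data.csv', 'data/aiqicha_company_details.csv')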