aiqicha
This commit is contained in:
@@ -49,7 +49,30 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
||||
csv_file_name=output_csv,
|
||||
headers=output_headers
|
||||
)
|
||||
|
||||
|
||||
# 读取已有的企业名称,用于去重
|
||||
existing_company_names = set()
|
||||
try:
|
||||
existing_data = output_csv_tool.get_all_data()
|
||||
existing_company_names = {item['company_name'] for item in existing_data if item['company_name']}
|
||||
print(f"已存在 {len(existing_company_names)} 条企业数据")
|
||||
except FileNotFoundError:
|
||||
print(f"输出文件 {output_csv} 不存在,将创建新文件")
|
||||
except Exception as e:
|
||||
print(f"读取已有数据时出错: {e}")
|
||||
|
||||
# 过滤掉已存在的企业数据
|
||||
filtered_aiqicha_data = []
|
||||
for item in aiqicha_data:
|
||||
company_name = item.get('company_name', '')
|
||||
if company_name and company_name in existing_company_names:
|
||||
print(f"跳过已存在的企业: {company_name}")
|
||||
else:
|
||||
filtered_aiqicha_data.append(item)
|
||||
|
||||
aiqicha_data = filtered_aiqicha_data
|
||||
print(f'过滤后剩余 {len(aiqicha_data)} 条爱企查数据待处理')
|
||||
|
||||
# 使用爱企查详情爬虫
|
||||
with AiqichaDetailCrawler() as crawler:
|
||||
company_details = []
|
||||
@@ -101,10 +124,11 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
||||
time.sleep(next_sleep_interval)
|
||||
|
||||
print(f"总共成功处理并保存了 {success_count} 条企业详情数据到 {output_csv}")
|
||||
# crawler.browser.close_browser()
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 从原始搜索结果CSV中读取爱企查URL,爬取详情并保存到新CSV文件
|
||||
crawl_and_save_aiqicha_details('company_search_bing_data.csv', 'aiqicha_company_details.csv')
|
||||
crawl_and_save_aiqicha_details('data/company_search_bing_data.csv', 'data/aiqicha_company_details.csv')
|
||||
|
||||
# 原有代码保留
|
||||
# all_data = query_init_company_data('company_search_bing_data.csv')
|
||||
|
||||
Reference in New Issue
Block a user