Files
SearchCompany/crawler_campany_detail_by_data_csv.py
manchuwork 102dd78c26 aiqicha
2025-09-25 03:19:34 +08:00

114 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import random
from tool.csv_tool import CSVTool
from tool.aiqicha_detail import AiqichaDetailCrawler
import time
def query_init_company_data(csv_file_name):
    """Load all rows from a company-search result CSV.

    Args:
        csv_file_name (str): Path to the CSV produced by the search crawler.
            The file is expected to carry the six columns listed below.

    Returns:
        list[dict]: Every row in the file as a list of dicts keyed by header.
    """
    # CSVTool binds the file to a fixed header set; rows come back as dicts.
    csv_tool = CSVTool(
        csv_file_name=csv_file_name,
        headers=['title', 'url', 'web_site_type', 'request_url', 'company_name', 'create_time'],
    )
    all_data = csv_tool.get_all_data()
    # Debug dump of the full dataset; kept for parity with the original script.
    print("所有数据:", all_data)
    return all_data
def crawl_and_save_aiqicha_details(input_csv, output_csv):
    """Read aiqicha URLs from *input_csv*, crawl each company's detail page,
    and append the normalized records to *output_csv*.

    Args:
        input_csv (str): CSV of search results; only rows whose
            ``web_site_type`` equals ``'aiqicha'`` are processed.
        output_csv (str): Destination CSV for the crawled company details.
            Rows are deduplicated on ``company_name`` by CSVTool.
    """
    input_data = query_init_company_data(input_csv)
    # Keep only rows that point at aiqicha (爱企查) detail pages.
    aiqicha_data = [item for item in input_data if item['web_site_type'] == 'aiqicha']
    print(f'找到 {len(aiqicha_data)} 条爱企查数据')

    # Output schema; 'create_time' is filled by save_data(create_time=True).
    output_headers = [
        'company_name',          # company name
        'credit_code',           # unified social credit code
        'legal_representative',  # legal representative
        'registered_capital',    # registered capital
        'establishment_date',    # date of establishment
        'business_status',       # operating status
        'address',               # registered address
        'business_scope',        # business scope
        'source_url',            # URL the record was crawled from
        'create_time',           # row creation timestamp
    ]
    output_csv_tool = CSVTool(
        csv_file_name=output_csv,
        headers=output_headers,
    )

    # Crawler is a context manager; presumably it owns a browser/session
    # that must be closed — TODO confirm against AiqichaDetailCrawler.
    with AiqichaDetailCrawler() as crawler:
        success_count = 0
        for item in aiqicha_data:
            url = item['url']
            refer_url: str = item['request_url']
            print(f"正在处理: {url}")
            detail = crawler.crawl_company_detail(url, refer_url)
            if detail:
                # Map crawler field names onto the output CSV schema.
                # 'source_url' is taken straight from the input row instead of
                # round-tripping it through the detail dict.
                converted_item = {
                    'company_name': detail.get('name', ''),
                    'credit_code': detail.get('credit_code', ''),
                    'legal_representative': detail.get('legal_representative', ''),
                    'registered_capital': detail.get('registered_capital', ''),
                    'establishment_date': detail.get('establishment_date', ''),
                    'business_status': detail.get('business_status', ''),
                    'address': detail.get('address', ''),
                    'business_scope': detail.get('business_scope', ''),
                    'source_url': url,
                }
                # Persist immediately so a crash mid-run loses at most one row.
                written_count = output_csv_tool.save_data(
                    [converted_item],
                    unique_titles=['company_name'],
                    create_time=True,
                )
                if written_count > 0:
                    success_count += 1
                    print(f"成功保存 {detail.get('name', '未知公司')} 的信息")
                else:
                    print(f"保存 {detail.get('name', '未知公司')} 的信息失败(可能已存在)")
            else:
                print(f"获取 {url} 的信息失败")
            # Deliberate pacing between requests to avoid being rate-limited:
            # a fixed 2s floor plus a 5–15s random jitter.
            time.sleep(2)
            next_sleep_interval = random.uniform(5, 15)
            time.sleep(next_sleep_interval)

    print(f"总共成功处理并保存了 {success_count} 条企业详情数据到 {output_csv}")
if __name__ == '__main__':
    # Read aiqicha URLs from the raw search-result CSV, crawl each company's
    # detail page, and save the results to a new CSV.
    crawl_and_save_aiqicha_details('company_search_bing_data.csv', 'aiqicha_company_details.csv')