138 lines
5.4 KiB
Python
138 lines
5.4 KiB
Python
import random
|
||
|
||
from tool.csv_tool import CSVTool
|
||
from tool.aiqicha_detail import AiqichaDetailCrawler
|
||
import time
|
||
|
||
def query_init_company_data(csv_file_name):
    """Load every row from the given company-search CSV file.

    Args:
        csv_file_name (str): Path of the CSV file to read.

    Returns:
        list: All rows as returned by ``CSVTool.get_all_data()``.
    """
    # Column layout of the search-result CSV.
    columns = ['title', 'url', 'web_site_type', 'request_url', 'company_name', 'create_time']
    reader = CSVTool(csv_file_name=csv_file_name, headers=columns)
    rows = reader.get_all_data()
    print("所有数据:", rows)
    return rows
|
||
|
||
def _load_existing_company_names(csv_tool, output_csv):
    """Return the set of company names already present in the output CSV.

    An empty set is returned when the file does not yet exist or cannot
    be read, so the caller can proceed and create/append to the file.
    """
    try:
        existing_data = csv_tool.get_all_data()
        names = {item['company_name'] for item in existing_data if item.get('company_name')}
        print(f"已存在 {len(names)} 条企业数据")
        return names
    except FileNotFoundError:
        print(f"输出文件 {output_csv} 不存在,将创建新文件")
    except Exception as e:
        # Best-effort: a corrupt/unreadable output file should not abort
        # the crawl; we just lose dedup against prior runs.
        print(f"读取已有数据时出错: {e}")
    return set()


def _drop_already_saved(items, existing_company_names):
    """Filter out rows whose company name was already saved in a prior run."""
    remaining = []
    for item in items:
        company_name = item.get('company_name', '')
        if company_name and company_name in existing_company_names:
            print(f"跳过已存在的企业: {company_name}")
        else:
            remaining.append(item)
    return remaining


def _detail_to_row(detail):
    """Map a crawler detail dict onto the output CSV's column names."""
    return {
        'company_name': detail.get('name', ''),
        'credit_code': detail.get('credit_code', ''),
        'legal_representative': detail.get('legal_representative', ''),
        'registered_capital': detail.get('registered_capital', ''),
        'establishment_date': detail.get('establishment_date', ''),
        'business_status': detail.get('business_status', ''),
        'address': detail.get('address', ''),
        'business_scope': detail.get('business_scope', ''),
        'source_url': detail.get('source_url', ''),
    }


def crawl_and_save_aiqicha_details(input_csv, output_csv):
    """Read aiqicha URLs from *input_csv*, crawl each company's detail
    page, and append the results to *output_csv*.

    Rows already present in *output_csv* (matched by company name) are
    skipped, and each successfully crawled row is written immediately so
    a crash loses at most the in-flight record.

    Args:
        input_csv (str): CSV of search results containing aiqicha URLs.
        output_csv (str): CSV file the company details are appended to.
    """
    input_data = query_init_company_data(input_csv)

    # Keep only aiqicha rows. Use .get() so a row missing the
    # 'web_site_type' column is skipped instead of raising KeyError
    # (the original indexed the dict directly).
    aiqicha_data = [item for item in input_data if item.get('web_site_type') == 'aiqicha']
    print(f'找到 {len(aiqicha_data)} 条爱企查数据')

    # Output CSV schema.
    output_headers = [
        'company_name',          # 公司名称
        'credit_code',           # 统一社会信用代码
        'legal_representative',  # 法定代表人
        'registered_capital',    # 注册资本
        'establishment_date',    # 成立日期
        'business_status',       # 经营状态
        'address',               # 公司地址
        'business_scope',        # 经营范围
        'source_url',            # 原始URL
        'create_time',           # 创建时间 (filled in by save_data)
    ]
    output_csv_tool = CSVTool(csv_file_name=output_csv, headers=output_headers)

    # Dedup against whatever was saved in earlier runs.
    existing_company_names = _load_existing_company_names(output_csv_tool, output_csv)
    aiqicha_data = _drop_already_saved(aiqicha_data, existing_company_names)
    print(f'过滤后剩余 {len(aiqicha_data)} 条爱企查数据待处理')

    with AiqichaDetailCrawler() as crawler:
        success_count = 0

        for item in aiqicha_data:
            url = item['url']
            refer_url: str = item['request_url']
            print(f"正在处理: {url}")

            detail = crawler.crawl_company_detail(url, refer_url)

            if detail:
                # Record where this detail came from before converting.
                detail['source_url'] = url
                converted_item = _detail_to_row(detail)

                # Save each row immediately so a crash mid-run loses at
                # most one record; save_data dedups on company_name.
                written_count = output_csv_tool.save_data(
                    [converted_item],
                    unique_titles=['company_name'],
                    create_time=True,
                )

                if written_count > 0:
                    success_count += 1
                    print(f"成功保存 {detail.get('name', '未知公司')} 的信息")
                else:
                    print(f"保存 {detail.get('name', '未知公司')} 的信息失败(可能已存在)")
            else:
                print(f"获取 {url} 的信息失败")

            # Throttle: fixed 2s base delay plus 5-15s random jitter,
            # to avoid hammering the site with regular-interval requests.
            time.sleep(2)
            time.sleep(random.uniform(5, 15))

    print(f"总共成功处理并保存了 {success_count} 条企业详情数据到 {output_csv}")
|
||
|
||
if __name__ == '__main__':
    # Read aiqicha URLs from the raw Bing search-result CSV, crawl each
    # company's detail page, and append the rows to the details CSV.
    crawl_and_save_aiqicha_details(
        'data/company_search_bing_data.csv',
        'data/aiqicha_company_details.csv',
    )