# SearchCompany/crawler_campany_detail_by_data_csv.py
import random
import time

from tool.csv_tool import CSVTool
from tool.aiqicha_detail import AiqichaDetailCrawler
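
# NOTE: the local helpers above are assumed (from how they are used in this
# file) to expose roughly this surface:
#   CSVTool(csv_file_name=..., headers=...).get_all_data() -> list[dict]
#   CSVTool.save_data(rows, unique_titles=[...], create_time=True) -> int (rows written)
#   AiqichaDetailCrawler() -- a context manager with
#       crawl_company_detail(url, refer_url) -> dict | None
# A fuller hypothetical sketch of CSVTool appears at the end of this file.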


def query_init_company_data(csv_file_name):
    """Load all rows from the search-result CSV produced by the search crawler."""
    # Create a CSV tool instance bound to the input file's schema.
    csv_tool = CSVTool(
        csv_file_name=csv_file_name,
        headers=['title', 'url', 'web_site_type', 'request_url', 'company_name', 'create_time']
    )
    # Read every row.
    all_data = csv_tool.get_all_data()
    print("All rows:", all_data)
    return all_data


def parse_operating_period(period_str):
    """
    Parse an operating-period string into a (start_date, end_date) pair.

    Example inputs: "2020-01-01至2030-12-31" or "2020-01-01起长期"
    ("至" = "to", "起" = "from", "长期" = "long-term").
    """
    if not period_str:
        return '', ''
    if '至' in period_str:
        parts = period_str.split('至')
        return str(parts[0].strip()), str(parts[1].strip())
    elif '起' in period_str:
        parts = period_str.split('起')
        return str(parts[0].strip()), '长期'
    else:
        return str(period_str.strip()), ''
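
# Illustrative calls for parse_operating_period, based on the formats named
# in its docstring (the exact strings aiqicha returns may vary):
#   parse_operating_period('2020-01-01至2030-12-31')  -> ('2020-01-01', '2030-12-31')
#   parse_operating_period('2020-01-01起长期')          -> ('2020-01-01', '长期')
#   parse_operating_period('')                          -> ('', '')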


def crawl_and_save_aiqicha_details(input_csv, output_csv):
    """
    Read aiqicha URLs from a CSV file, crawl each company's detail page,
    and save the results to a new CSV file.

    Args:
        input_csv (str): Input CSV file containing aiqicha URLs.
        output_csv (str): Output CSV file for the company details.
    """
    # Load the raw search results.
    input_data = query_init_company_data(input_csv)
    # Keep only the aiqicha rows.
    aiqicha_data = [item for item in input_data if item['web_site_type'] == 'aiqicha']
    print(f'Found {len(aiqicha_data)} aiqicha rows')
    # Column order of the output CSV.
    output_headers = [
        'company_name',            # company name
        'credit_code',             # unified social credit code
        'legal_representative',    # legal representative
        'registered_capital',      # registered capital
        'establishment_date',      # establishment date
        'business_status',         # business status
        'address',                 # company address
        'business_scope',          # business scope
        'source_url',              # original URL
        # Additional fields
        'company_type',            # company type
        'industry',                # industry
        'registration_authority',  # registration authority
        'operating_period',        # operating period
        'operating_start_date',    # operating period start date
        'operating_end_date',      # operating period end date
        'actual_capital',          # paid-in capital
        'taxpayer_id',             # taxpayer identification number
        'organization_code',       # organization code
        'approved_date',           # approval date
        'staff_size',              # number of insured employees
        'phone',                   # phone
        'create_time',             # creation time
    ]
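    # Note: 'create_time' appears in the headers but is never set when
    # building converted_item below; it is assumed to be filled in by
    # CSVTool.save_data(..., create_time=True) (see the hypothetical sketch
    # at the end of this file).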
    # CSV tool instance for the output file.
    output_csv_tool = CSVTool(
        csv_file_name=output_csv,
        headers=output_headers
    )
    # Read company names that were already saved, for de-duplication.
    existing_company_names = set()
    try:
        existing_data = output_csv_tool.get_all_data()
        existing_company_names = {item['company_name'] for item in existing_data if item['company_name']}
        print(f"{len(existing_company_names)} company rows already exist")
    except FileNotFoundError:
        print(f"Output file {output_csv} does not exist; a new file will be created")
    except Exception as e:
        print(f"Error while reading existing data: {e}")
    # Drop rows whose company already exists in the output file.
    filtered_aiqicha_data = []
    for item in aiqicha_data:
        company_name = item.get('company_name', '')
        if company_name and company_name in existing_company_names:
            print(f"Skipping existing company: {company_name}")
        else:
            filtered_aiqicha_data.append(item)
    aiqicha_data = filtered_aiqicha_data
    print(f'{len(aiqicha_data)} aiqicha rows left to process after filtering')
    if not aiqicha_data:
        print("No aiqicha rows to process; exiting")
        return
    # Crawl each detail page with the aiqicha detail crawler.
    with AiqichaDetailCrawler() as crawler:
        success_count = 0
        for i, item in enumerate(aiqicha_data, start=1):
            url = item['url']
            refer_url = item['request_url']
            print(f"Processing ({i}/{len(aiqicha_data)}): {url}")
            # Crawl the company detail page.
            detail = crawler.crawl_company_detail(url, refer_url)
            if detail:
                # Parse the operating-period field into start/end dates.
                operating_period = detail.get('operating_period', '')
                start_date, end_date = parse_operating_period(operating_period)
                # Record the source URL.
                detail['source_url'] = url
                # Rename fields to match the CSV headers.
                converted_item = {
                    'company_name': detail.get('name', ''),
                    'credit_code': detail.get('credit_code', ''),
                    'legal_representative': detail.get('legal_representative', ''),
                    'registered_capital': detail.get('registered_capital', ''),
                    'establishment_date': detail.get('establishment_date', ''),
                    'business_status': detail.get('business_status', ''),
                    'address': detail.get('address', ''),
                    'business_scope': detail.get('business_scope', ''),
                    'source_url': detail.get('source_url', ''),
                    # Additional field mappings
                    'company_type': detail.get('company_type', ''),
                    'industry': detail.get('industry', ''),
                    'registration_authority': detail.get('registration_authority', ''),
                    'operating_period': detail.get('operating_period', ''),
                    'operating_start_date': str(start_date),
                    'operating_end_date': str(end_date),
                    'actual_capital': detail.get('actual_capital', ''),
                    'taxpayer_id': detail.get('taxpayer_id', ''),
                    'organization_code': detail.get('organization_code', ''),
                    'approved_date': detail.get('approved_date', ''),
                    'staff_size': detail.get('staff_size', ''),
                    'phone': detail.get('phone', '')
                }
                # Save each row immediately so data is not lost on failure.
                written_count = output_csv_tool.save_data(
                    [converted_item],
                    unique_titles=['company_name'],
                    create_time=True
                )
                if written_count > 0:
                    success_count += 1
                    print(f"Saved details for {detail.get('name', 'unknown company')}")
                else:
                    print(f"Failed to save {detail.get('name', 'unknown company')} (may already exist)")
            else:
                print(f"Failed to fetch details for {url}")
            # Throttle requests: a fixed base delay plus random jitter.
            time.sleep(2)
            next_sleep_interval = random.uniform(3, 15)
            time.sleep(next_sleep_interval)
        print(f"Processed and saved {success_count} company detail rows to {output_csv} in total")
        # crawler.browser.close_browser()


if __name__ == '__main__':
    # Read aiqicha URLs from the raw search-result CSV, crawl the details,
    # and save them to a new CSV file.
    crawl_and_save_aiqicha_details('data/company_search_bing_data.csv', 'data/aiqicha_company_details.csv')
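
    # Expected invocation (assumption: run from the repo root, with the data/
    # directory and the input CSV already present):
    #   python SearchCompany/crawler_campany_detail_by_data_csv.py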

    # Legacy code, kept for reference:
    # all_data = query_init_company_data('company_search_bing_data.csv')
    # filter = [item for item in all_data if item['web_site_type'] == 'aiqicha']
    # print('aiqicha rows:', filter)
    # for item in filter:
    #     pass
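

# Below is a minimal, hypothetical sketch of the CSVTool surface this script
# relies on, reconstructed purely from the calls made above; the real
# tool.csv_tool implementation may differ. Kept commented out so it does not
# shadow the import.
#
# import csv
# import os
# from datetime import datetime
#
# class CSVTool:
#     """Hypothetical stand-in for tool.csv_tool.CSVTool, inferred from usage."""
#
#     def __init__(self, csv_file_name, headers):
#         self.csv_file_name = csv_file_name
#         self.headers = headers
#
#     def get_all_data(self):
#         # Raises FileNotFoundError when the file does not exist yet, which
#         # matches the except clause in crawl_and_save_aiqicha_details().
#         with open(self.csv_file_name, newline='', encoding='utf-8') as f:
#             return list(csv.DictReader(f))
#
#     def save_data(self, rows, unique_titles=None, create_time=False):
#         # De-duplicate against rows already on disk, then append.
#         seen = set()
#         if unique_titles and os.path.exists(self.csv_file_name):
#             seen = {tuple(r.get(t, '') for t in unique_titles)
#                     for r in self.get_all_data()}
#         is_new = not os.path.exists(self.csv_file_name)
#         written = 0
#         with open(self.csv_file_name, 'a', newline='', encoding='utf-8') as f:
#             writer = csv.DictWriter(f, fieldnames=self.headers)
#             if is_new:
#                 writer.writeheader()
#             for row in rows:
#                 key = tuple(row.get(t, '') for t in (unique_titles or []))
#                 if unique_titles and key in seen:
#                     continue
#                 if create_time:
#                     row = {**row, 'create_time': datetime.now().isoformat()}
#                 writer.writerow(row)
#                 seen.add(key)
#                 written += 1
#         return written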