aiqicha
This commit is contained in:
@@ -15,6 +15,24 @@ def query_init_company_data(csv_file_name):
|
||||
print("所有数据:", all_data)
|
||||
return all_data
|
||||
|
||||
|
||||
def parse_operating_period(period_str):
    """Split an operating-period string into a (start, end) pair.

    Recognised forms:
        "2020-01-01至2030-12-31" -> ("2020-01-01", "2030-12-31")
        "2020-01-01起长期"        -> ("2020-01-01", "长期")
        anything else             -> (stripped input, "")

    Args:
        period_str: The raw operating-period value. Falsy values (None, "",
            0) yield two empty strings. Non-string values are converted with
            ``str()`` before parsing instead of raising ``TypeError``.

    Returns:
        A 2-tuple of strings ``(start_date, end_date)``, both stripped of
        surrounding whitespace. ``end_date`` is "长期" for the open-ended
        "…起…" form and "" when no separator is present.
    """
    if not period_str:
        return '', ''

    # Normalise once: the membership tests below require a str, and a truthy
    # non-string value would otherwise raise TypeError.
    text = str(period_str).strip()
    if not text:
        return '', ''

    if '至' in text:
        # Only the first two segments are meaningful; any further "至"
        # segments are ignored.
        start, end = text.split('至')[:2]
        return start.strip(), end.strip()

    if '起' in text:
        # Open-ended period: whatever follows "起" is reported as "长期".
        return text.split('起')[0].strip(), '长期'

    return text, ''
|
||||
|
||||
def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
||||
"""
|
||||
从CSV文件中读取爱企查URL,爬取企业详情,并保存到新的CSV文件中
|
||||
@@ -41,18 +59,21 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
||||
'address', # 公司地址
|
||||
'business_scope', # 经营范围
|
||||
'source_url', # 原始URL
|
||||
'create_time' ,
|
||||
|
||||
# 新增字段
|
||||
'company_type', # 企业类型
|
||||
'industry', # 所属行业
|
||||
'registration_authority', # 登记机关
|
||||
'operating_period', # 营业期限
|
||||
'operating_start_date', # 营业期限开始日期
|
||||
'operating_end_date', # 营业期限结束日期
|
||||
'actual_capital', # 实缴资本
|
||||
'taxpayer_id', # 纳税人识别号
|
||||
'organization_code', # 组织机构代码
|
||||
'approved_date', # 核准日期
|
||||
'staff_size', # 参保人数
|
||||
'phone' # 电话
|
||||
'phone', # 电话,
|
||||
'create_time',
|
||||
# 创建时间
|
||||
]
|
||||
|
||||
@@ -104,6 +125,11 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
||||
|
||||
if detail:
|
||||
# 添加来源URL和公司名称
|
||||
|
||||
# 解析营业期限字段
|
||||
operating_period = detail.get('operating_period', '')
|
||||
start_date, end_date = parse_operating_period(operating_period)
|
||||
|
||||
detail['source_url'] = url
|
||||
# 转换字段名以匹配CSV表头
|
||||
converted_item = {
|
||||
@@ -121,6 +147,8 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
||||
'industry': detail.get('industry', ''),
|
||||
'registration_authority': detail.get('registration_authority', ''),
|
||||
'operating_period': detail.get('operating_period', ''),
|
||||
'operating_start_date': str(start_date),
|
||||
'operating_end_date': str(end_date),
|
||||
'actual_capital': detail.get('actual_capital', ''),
|
||||
'taxpayer_id': detail.get('taxpayer_id', ''),
|
||||
'organization_code': detail.get('organization_code', ''),
|
||||
|
||||
Reference in New Issue
Block a user