This commit is contained in:
manchuwork
2025-10-03 03:02:27 +08:00
parent 6444fecd4e
commit 84143ff6fb
5 changed files with 1685 additions and 15 deletions

View File

@@ -373,8 +373,8 @@ if __name__ == '__main__':
# results2 = searcher.search("腾讯", 1)
# results3 = searcher.search("百度", 1)
sleep_time = 5
sleep_time += random.randint(3, 10)
sleep_time = 3
sleep_time += random.randint(1, 2)
time.sleep(sleep_time)
pass
pass

View File

@@ -41,7 +41,19 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
'address', # 公司地址
'business_scope', # 经营范围
'source_url', # 原始URL
'create_time' # 创建时间
'create_time' ,
# 新增字段
'company_type', # 企业类型
'industry', # 所属行业
'registration_authority', # 登记机关
'operating_period', # 营业期限
'actual_capital', # 实缴资本
'taxpayer_id', # 纳税人识别号
'organization_code', # 组织机构代码
'approved_date', # 核准日期
'staff_size', # 参保人数
'phone' # 电话
# NOTE: 'create_time' (listed above) holds the record creation time
]
# 创建输出CSV工具实例
@@ -99,7 +111,18 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
'business_status': detail.get('business_status', ''),
'address': detail.get('address', ''),
'business_scope': detail.get('business_scope', ''),
'source_url': detail.get('source_url', '')
'source_url': detail.get('source_url', ''),
# 新增字段映射
'company_type': detail.get('company_type', ''),
'industry': detail.get('industry', ''),
'registration_authority': detail.get('registration_authority', ''),
'operating_period': detail.get('operating_period', ''),
'actual_capital': detail.get('actual_capital', ''),
'taxpayer_id': detail.get('taxpayer_id', ''),
'organization_code': detail.get('organization_code', ''),
'approved_date': detail.get('approved_date', ''),
'staff_size': detail.get('staff_size', ''),
'phone': detail.get('phone', '')
}
# 立即保存每条数据,避免数据丢失

File diff suppressed because one or more lines are too long

View File

@@ -51,16 +51,22 @@ class AiqichaDetailParser:
# 批量提取信息
# 爱企查页面使用表格结构,需要特殊处理
field_mapping = {
'legal_representative': '法定代表人',
'business_scope': '经营范围',
'credit_code': '统一社会信用代码',
'registered_capital': '注册资本',
'establishment_date': '成立日期',
'business_status': '经营状态',
'company_type': '企业类型',
'registration_authority': '登记机关',
'operating_period': '营业期限',
'address': '注册地址'
'legal_representative': '法定代表人',
'business_scope': '经营范围',
'credit_code': '统一社会信用代码',
'registered_capital': '注册资本',
'establishment_date': '成立日期',
'business_status': '经营状态',
'company_type': '企业类型',
'registration_authority': '登记机关',
'operating_period': '营业期限',
'address': '注册地址',
'administrative_division': '行政区划',
'business_registration_number': '工商注册号',
'taxpayer_qualification': '纳税人资质',
'approved_date': '核准日期',
'staff_size': '参保人数',
'former_name': '曾用名'
}
for field_name, field_text in field_mapping.items():
company_info[field_name] = self._extract_field_value(field_text)