Files
SearchCompany/tool/aiqicha_detail_parser.py
manchuwork de3c97e828 aiqicha
2025-11-03 18:57:58 +08:00

193 lines
7.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# file: tool/aiqicha_detail_parser.py
import re
class AiqichaDetailParser:
    """Parser for an Aiqicha company detail page.

    All data is read from ``browser.page`` through ``query_selector``,
    ``evaluate_handle`` and ``inner_text`` calls.  Every extractor is
    best-effort: when a value cannot be found the placeholder "未知" is
    returned instead of raising.

    NOTE(review): the selector syntax and method names suggest that
    ``browser.page`` is a Playwright sync-API ``Page`` -- confirm.
    """

    # Placeholder returned whenever a value cannot be extracted.
    _UNKNOWN = "未知"

    # Result key -> label text of the table cell that precedes the value cell.
    _FIELD_LABELS = {
        'legal_representative': '法定代表人',
        'business_scope': '经营范围',
        'credit_code': '统一社会信用代码',
        'registered_capital': '注册资本',
        'establishment_date': '成立日期',
        'business_status': '经营状态',
        'company_type': '企业类型',
        'registration_authority': '登记机关',
        'operating_period': '营业期限',
        'address': '注册地址',
        'administrative_division': '行政区划',
        'business_registration_number': '工商注册号',
        'taxpayer_qualification': '纳税人资质',
        'approved_date': '核准日期',
        'staff_size': '参保人数',
        'former_name': '曾用名',
    }

    # Selectors tried, in order, when looking for the company name.
    _NAME_SELECTORS = (
        'title',               # page title
        '.company-name',       # common company-name class
        'h1.enterprise-name',  # company-name heading
        '.company-title',      # other possible class name
    )

    # Separator after which the real value starts inside a value cell.
    # NOTE(review): this literal is empty in the reviewed copy of the file,
    # presumably because a character was lost.  ``str.split("")`` raises
    # ValueError, which made every field come back as "未知".  Prefix
    # stripping is skipped while this is empty; restore the intended
    # delimiter here once it is known.
    _VALUE_PREFIX_SEPARATOR = ""

    # Trailing UI noise appended to some value cells.
    _LEGAL_REP_SUFFIX_RE = re.compile(r'\s*TA有\d+家企业.*$')
    _ADDRESS_SUFFIX_RES = (
        re.compile(r'\s*查看地图.*$'),
        re.compile(r'\s*附近企业.*$'),
    )
    _DIGITS_RE = re.compile(r'(\d+)')

    def __init__(self, browser):
        """Store the browser wrapper used for all page queries.

        Args:
            browser: Object exposing the loaded page as ``browser.page``.
        """
        self.browser = browser

    def parse_company_info(self):
        """Extract the basic company information from the current page.

        Returns:
            dict: One entry per key of ``_FIELD_LABELS`` plus ``'phone'`` and
            ``'name'``.  Values are strings; "未知" marks a missing value.
        """
        # The page presents these fields as a table: a label cell followed
        # by a value cell, so each field is looked up by its label text.
        company_info = {
            field_name: self._extract_field_value(label)
            for field_name, label in self._FIELD_LABELS.items()
        }
        # Phone number and name live outside the table and need their own
        # extraction logic.
        company_info['phone'] = self._extract_phone_number()
        company_info['name'] = self._extract_company_name()
        return company_info

    def _extract_company_name(self):
        """Return the company name, or "未知" when no selector yields one.

        The selectors in ``_NAME_SELECTORS`` are tried in order and the first
        non-empty result wins.
        """
        for selector in self._NAME_SELECTORS:
            try:
                element = self.browser.page.query_selector(selector)
                if not element:
                    continue
                text = element.inner_text().strip()
                # A page title may carry a suffix after a hyphen; keep only
                # the part before the first hyphen.
                if selector == 'title' and '-' in text:
                    text = text.split('-')[0].strip()
                text = self._clean_text(text)
                if text and text != self._UNKNOWN:
                    return text
            except Exception:
                # Best-effort: a failing selector must not prevent the
                # remaining ones from being tried.  (Narrowed from a bare
                # ``except`` so Ctrl-C / SystemExit still propagate.)
                continue
        return self._UNKNOWN

    def _extract_field_value(self, field_text):
        """Return the value shown next to the table label ``field_text``.

        Args:
            field_text (str): Label text of the ``td`` preceding the value.

        Returns:
            str: Cleaned value, or "未知" if the label cell, its next sibling
            or a non-empty value is missing, or if any page call raises.
        """
        try:
            # Locate the label cell by its visible text.
            label_cell = self.browser.page.query_selector(
                f'td:has-text("{field_text}")'
            )
            if not label_cell:
                return self._UNKNOWN

            # The value sits in the next sibling element.  evaluate_handle()
            # yields a handle even when the sibling is null, so resolve it
            # to an element (None when there is no sibling) before use.
            sibling = label_cell.evaluate_handle('el => el.nextElementSibling')
            value_cell = sibling.as_element() if sibling is not None else None
            if value_cell is None:
                return self._UNKNOWN

            text = value_cell.inner_text().strip()

            # Drop everything up to the first separator, when one is
            # configured (an empty separator would make str.split raise).
            separator = self._VALUE_PREFIX_SEPARATOR
            if separator and separator in text:
                text = text.split(separator, 1)[1].strip()

            if field_text == "法定代表人":
                # Strip the trailing "TA有N家企业..." blurb.
                text = self._LEGAL_REP_SUFFIX_RE.sub('', text)
            if field_text == "注册地址":
                # Strip the trailing "查看地图" / "附近企业" link texts.
                for pattern in self._ADDRESS_SUFFIX_RES:
                    text = pattern.sub('', text)
            if field_text == "参保人数":
                # Keep only the first run of digits, e.g. "7人" -> "7".
                match = self._DIGITS_RE.search(text)
                if match:
                    text = match.group(1)

            text = self._clean_text(text)
            if text:
                return text
        except Exception as e:
            print(f"提取字段 {field_text} 时出错: {e}")
        return self._UNKNOWN

    def _clean_text(self, text):
        """Collapse whitespace runs to single spaces and trim both ends.

        Args:
            text (str): Raw text; ``None`` or an empty string is accepted.

        Returns:
            str: The normalised text ("" for ``None`` or empty input).
        """
        if not text:
            return ""
        # ``\s+`` already covers \r, \n and \t, so one pass is sufficient.
        return re.sub(r'\s+', ' ', text).strip()

    def _extract_phone_number(self):
        """Return the phone number shown on the page, or "未知".

        Returns:
            str: Cleaned phone text; "未知" when the container or the number
            element is missing, the text is empty, or a page call raises.
        """
        try:
            # Container holding the phone list.
            phone_container = self.browser.page.query_selector(
                "div.business-info div.telphone-lists-wrap"
            )
            if phone_container:
                # Element holding the phone number itself.
                phone_element = phone_container.query_selector(
                    "span.copy-box span"
                )
                if phone_element:
                    phone = self._clean_text(phone_element.inner_text())
                    if phone:
                        return phone
        except Exception as e:
            # Report the failure like the other extractors instead of
            # swallowing it silently.
            print(f"提取电话号码时出错: {e}")
        return self._UNKNOWN