# file: tool/aiqicha_detail_parser.py

import re


class AiqichaDetailParser:
    """Parser for an Aiqicha (爱企查) company detail page.

    All lookups go through ``browser.page`` using ``query_selector``-style
    calls. Every extractor returns the placeholder string "未知" when the
    value cannot be found or an error occurs while reading it.
    """

    # Placeholder returned whenever a value cannot be extracted.
    _UNKNOWN = "未知"

    # Result key -> label text of the table cell that precedes the value cell.
    _FIELD_LABELS = {
        'legal_representative': '法定代表人',
        'business_scope': '经营范围',
        'credit_code': '统一社会信用代码',
        'registered_capital': '注册资本',
        'establishment_date': '成立日期',
        'business_status': '经营状态',
        'company_type': '企业类型',
        'registration_authority': '登记机关',
        'operating_period': '营业期限',
        'address': '注册地址',
    }

    # Label text -> patterns for trailing page decorations to cut off the value
    # (e.g. the "TA有12家企业" link, or the "查看地图" / "附近企业" links).
    _TRAILING_NOISE = {
        '法定代表人': (re.compile(r'\s*TA有\d+家企业.*$'),),
        '注册地址': (
            re.compile(r'\s*查看地图.*$'),
            re.compile(r'\s*附近企业.*$'),
        ),
    }

    # Selectors tried in order when looking for the company name.
    _NAME_SELECTORS = (
        'title',               # page title
        '.company-name',       # common company-name class
        'h1.enterprise-name',  # company-name heading
        '.company-title',      # other possible class name
    )

    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, browser):
        """Store the browser wrapper used for all page queries.

        Args:
            browser: Object whose ``page`` attribute supports
                ``query_selector`` (the selectors used here look like
                Playwright selectors — confirm against the caller).
        """
        self.browser = browser

    def parse_company_info(self):
        """Extract the basic company information from the current page.

        Returns:
            dict: One entry per key of ``_FIELD_LABELS`` followed by
            ``'phone'`` and ``'name'``; missing values are "未知".
        """
        # The page lays the data out as label/value table cells, so each
        # field is located through its label text.
        company_info = {
            field_name: self._extract_field_value(label)
            for field_name, label in self._FIELD_LABELS.items()
        }

        # Phone number and company name live outside the table.
        company_info['phone'] = self._extract_phone_number()
        company_info['name'] = self._extract_company_name()
        return company_info

    def _extract_company_name(self):
        """Return the company name, or "未知" if no selector yields one.

        The selectors in ``_NAME_SELECTORS`` are tried in order; a failure on
        one selector is ignored so the next one can be tried.
        """
        for selector in self._NAME_SELECTORS:
            try:
                element = self.browser.page.query_selector(selector)
                if not element:
                    continue
                text = element.inner_text().strip()
                # The page title may carry a suffix after a dash; keep only
                # the part before the first dash.
                if selector == 'title' and '-' in text:
                    text = text.split('-')[0].strip()
                text = self._clean_text(text)
            except Exception:
                # Best effort: fall through to the next selector.
                continue
            if text and text != self._UNKNOWN:
                return text
        return self._UNKNOWN

    def _extract_field_value(self, field_text):
        """Return the value displayed next to the table cell labelled *field_text*.

        Args:
            field_text (str): Label text to look for inside a ``td`` element.

        Returns:
            str: The cleaned value, or "未知" when the label or its value cell
            is missing, the value is empty, or reading it raised an error
            (the error is printed).
        """
        try:
            # Find the td element containing the label text.
            label_cell = self.browser.page.query_selector(f'td:has-text("{field_text}")')
            if not label_cell:
                return self._UNKNOWN

            # The value sits in the next sibling cell.
            sibling = label_cell.evaluate_handle('el => el.nextElementSibling')
            # Playwright-style handles wrap a JS ``null`` in a truthy object;
            # ``as_element()`` turns that into None. Handles that do not
            # expose ``as_element`` are used as-is.
            as_element = getattr(sibling, 'as_element', None)
            value_cell = as_element() if callable(as_element) else sibling
            if not value_cell:
                return self._UNKNOWN

            # Collapse whitespace first: the noise patterns below are anchored
            # with ``.*$`` and cannot cross a line break, so multi-line cell
            # text would otherwise keep the decorations.
            text = self._clean_text(value_cell.inner_text())

            # Drop a "prefix:" in front of the value.
            # NOTE(review): this also cuts values that legitimately contain an
            # ASCII colon — confirm against real page content.
            if ":" in text:
                text = text.split(":", 1)[1].strip()

            for pattern in self._TRAILING_NOISE.get(field_text, ()):
                text = pattern.sub('', text)

            text = text.strip()
            if text:
                return text
        except Exception as e:
            print(f"提取字段 {field_text} 时出错: {e}")

        return self._UNKNOWN

    def _clean_text(self, text):
        """Collapse every whitespace run to a single space and trim the ends.

        Args:
            text (str): Raw text; ``None`` or an empty string is accepted.

        Returns:
            str: The cleaned text ("" for ``None`` or empty input).
        """
        if not text:
            return ""
        # ``\s+`` already covers newlines, carriage returns and tabs.
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def _extract_phone_number(self):
        """Return the phone number shown on the page, or "未知".

        "未知" is returned when the phone container or the number element is
        missing, the number is blank, or reading it raised an error (the
        error is printed).
        """
        try:
            # Locate the phone list container, then the number inside it.
            container = self.browser.page.query_selector("div.business-info div.telphone-lists-wrap")
            if container:
                phone_element = container.query_selector("span.copy-box span")
                if phone_element:
                    phone = self._clean_text(phone_element.inner_text())
                    if phone:
                        return phone
        except Exception as e:
            print(f"提取电话号码时出错: {e}")
        return self._UNKNOWN