# file: tool/aiqicha_detail_parser.py
import re


class AiqichaDetailParser:
    """Parser for an Aiqicha (爱企查) company detail page.

    All lookups go through ``self.browser.page`` using ``query_selector``,
    ``evaluate_handle`` and ``inner_text``.
    NOTE(review): these look like the Playwright sync API -- confirm against
    the caller that supplies ``browser``.
    """

    # Sentinel returned whenever a value cannot be extracted.
    _UNKNOWN = "未知"

    # Output key -> label text shown in the page's info table.
    # Insertion order defines the key order of the dict returned by
    # parse_company_info().
    _FIELD_LABELS = {
        'legal_representative': '法定代表人',
        'business_scope': '经营范围',
        'credit_code': '统一社会信用代码',
        'registered_capital': '注册资本',
        'establishment_date': '成立日期',
        'business_status': '经营状态',
        'company_type': '企业类型',
        'registration_authority': '登记机关',
        'operating_period': '营业期限',
        'address': '注册地址',
        'administrative_division': '行政区划',
        'business_registration_number': '工商注册号',
        'taxpayer_qualification': '纳税人资质',
        'approved_date': '核准日期',
        'staff_size': '参保人数',
        'former_name': '曾用名',
    }

    # Selectors tried, in order, when looking for the company name.
    _NAME_SELECTORS = (
        'title',               # page title
        '.company-name',       # common company-name class
        'h1.enterprise-name',  # company-name heading
        '.company-title',      # other possible class name
    )

    # Label -> patterns whose match (and everything after it) is UI noise.
    # re.DOTALL is required because inner_text() may contain newlines; without
    # it ``.*$`` stops at the first line break and the noise is left in place.
    _TRAILING_NOISE = {
        '法定代表人': (
            re.compile(r'\s*TA有\d+家企业.*$', re.DOTALL),
        ),
        '注册地址': (
            re.compile(r'\s*查看地图.*$', re.DOTALL),
            re.compile(r'\s*附近企业.*$', re.DOTALL),
        ),
    }

    def __init__(self, browser):
        """Store the browser wrapper used for all page queries.

        Args:
            browser: Object exposing a ``page`` attribute that supports
                ``query_selector``.
        """
        self.browser = browser

    def parse_company_info(self):
        """Collect the company's basic information from the current page.

        Returns:
            dict: One entry per key of ``_FIELD_LABELS`` followed by
            ``'phone'`` and ``'name'``. Values that cannot be extracted are
            ``"未知"``.
        """
        company_info = {
            field_name: self._extract_field_value(label)
            for field_name, label in self._FIELD_LABELS.items()
        }
        # Phone number and company name live outside the info table.
        company_info['phone'] = self._extract_phone_number()
        company_info['name'] = self._extract_company_name()
        return company_info

    def _extract_company_name(self):
        """Return the company name from the first selector that yields text.

        Returns:
            str: The cleaned company name, or ``"未知"`` if no selector
            produced a usable value.
        """
        for selector in self._NAME_SELECTORS:
            try:
                element = self.browser.page.query_selector(selector)
                if not element:
                    continue
                text = element.inner_text().strip()
                # The page title may carry a "- <site>" suffix; keep only the
                # part before the first hyphen.
                if selector == 'title' and '-' in text:
                    text = text.split('-')[0].strip()
                text = self._clean_text(text)
                if text and text != self._UNKNOWN:
                    return text
            except Exception:
                # Best effort: a failing selector must not prevent trying the
                # remaining ones. (Narrowed from a bare ``except`` so that
                # KeyboardInterrupt/SystemExit still propagate.)
                continue
        return self._UNKNOWN

    def _extract_field_value(self, field_text):
        """Read the table cell that follows the cell labelled ``field_text``.

        Args:
            field_text (str): Label text to search for inside a ``td``.

        Returns:
            str: The normalized value, or ``"未知"`` when the label cell or
            its next sibling is missing, the value is empty, or a page query
            raises.
        """
        try:
            label_cell = self.browser.page.query_selector(
                f'td:has-text("{field_text}")'
            )
            if not label_cell:
                return self._UNKNOWN

            # evaluate_handle() hands back a handle object even when the JS
            # expression evaluates to null, so its truthiness says nothing;
            # as_element() yields None unless the handle wraps a real element.
            sibling = label_cell.evaluate_handle('el => el.nextElementSibling')
            value_cell = sibling.as_element() if sibling else None
            if value_cell is None:
                return self._UNKNOWN

            text = self._normalize_field_value(
                field_text, value_cell.inner_text()
            )
            if text:
                return text
        except Exception as e:
            print(f"提取字段 {field_text} 时出错: {e}")
        return self._UNKNOWN

    def _normalize_field_value(self, field_text, raw_text):
        """Strip prefixes and label-specific UI noise from a raw cell value.

        Args:
            field_text (str): Label the value belongs to.
            raw_text (str): Text as returned by the value cell.

        Returns:
            str: The cleaned value; may be empty.
        """
        text = (raw_text or "").strip()

        # Drop a leading "<prefix>:" part.
        # NOTE(review): this also truncates values that legitimately contain
        # an ASCII colon -- kept as in the original behaviour.
        if ":" in text:
            text = text.split(":", 1)[1].strip()

        # Remove trailing extras such as "TA有12家企业" / "查看地图" / "附近企业".
        for pattern in self._TRAILING_NOISE.get(field_text, ()):
            text = pattern.sub('', text)

        # Insured-staff count: keep only the digits, e.g. "7人" -> "7".
        if field_text == "参保人数":
            match = re.search(r'(\d+)', text)
            if match:
                text = match.group(1)

        return self._clean_text(text)

    def _clean_text(self, text):
        """Collapse whitespace runs to single spaces and trim both ends.

        Args:
            text (str): Raw text; ``None`` or empty input is tolerated.

        Returns:
            str: The cleaned text (``""`` for ``None``/empty input).
        """
        if not text:
            return ""
        # \s already covers \r, \n and \t, so one substitution is enough.
        return re.sub(r'\s+', ' ', text).strip()

    def _extract_phone_number(self):
        """Return the phone number shown in the business-info section.

        Returns:
            str: The cleaned phone number, or ``"未知"`` when the container or
            number element is missing, the text is blank, or a page query
            raises.
        """
        try:
            container = self.browser.page.query_selector(
                "div.business-info div.telphone-lists-wrap"
            )
            phone_element = (
                container.query_selector("span.copy-box span")
                if container else None
            )
            if phone_element:
                phone = self._clean_text(phone_element.inner_text())
                # A blank element should report "unknown", not an empty string.
                if phone:
                    return phone
        except Exception as e:
            # Still best effort, but report the failure like the other
            # extractors instead of swallowing it silently.
            print(f"提取电话号码时出错: {e}")
        return self._UNKNOWN