# SearchCompany/tool/aiqicha_detail_parser.py

# file: tool/aiqicha_detail_parser.py

import re


class AiqichaDetailParser:
    """Parser for an Aiqicha (爱企查) company detail page.

    The parser reads from ``browser.page``, which is used through
    ``query_selector`` / ``evaluate_handle`` / ``inner_text`` calls
    (NOTE(review): this looks like a Playwright sync ``Page`` -- confirm
    against the caller). Every extractor is best-effort: when a value cannot
    be found or an error occurs, the placeholder ``"未知"`` is returned
    instead of raising.
    """

    # Placeholder returned for any field that cannot be extracted.
    _UNKNOWN = "未知"

    # Output key -> label text looked up in the page's detail table.
    _FIELD_LABELS = {
        'legal_representative': '法定代表人',
        'business_scope': '经营范围',
        'credit_code': '统一社会信用代码',
        'registered_capital': '注册资本',
        'establishment_date': '成立日期',
        'business_status': '经营状态',
        'company_type': '企业类型',
        'registration_authority': '登记机关',
        'operating_period': '营业期限',
        'address': '注册地址',
        'administrative_division': '行政区划',
        'business_registration_number': '工商注册号',
        'taxpayer_qualification': '纳税人资质',
        'approved_date': '核准日期',
        'staff_size': '参保人数',
        'former_name': '曾用名',
    }

    # Selectors tried, in order, when looking for the company name.
    _NAME_SELECTORS = (
        'title',               # page title
        '.company-name',       # common company-name class
        'h1.enterprise-name',  # company-name heading
        '.company-title',      # other possible class name
    )

    def __init__(self, browser):
        """Store the browser wrapper whose ``page`` attribute is queried.

        Args:
            browser: Object exposing the current page as ``browser.page``.
        """
        self.browser = browser

    def parse_company_info(self):
        """Extract the company's basic information from the current page.

        Returns:
            dict: One entry per key of ``_FIELD_LABELS`` plus ``'phone'`` and
            ``'name'``. Values are strings; ``"未知"`` marks a field that
            could not be extracted.
        """
        company_info = {
            field_name: self._extract_field_value(label)
            for field_name, label in self._FIELD_LABELS.items()
        }
        # Phone and name do not live in the label/value table, so they have
        # dedicated extractors.
        company_info['phone'] = self._extract_phone_number()
        company_info['name'] = self._extract_company_name()
        return company_info

    def _extract_company_name(self):
        """Return the company name, or ``"未知"`` if no selector yields one.

        The selectors in ``_NAME_SELECTORS`` are tried in order and the first
        non-empty text wins.
        """
        for selector in self._NAME_SELECTORS:
            try:
                element = self.browser.page.query_selector(selector)
                if not element:
                    continue
                text = element.inner_text().strip()
                # The page title may carry a suffix after a '-'; keep only
                # the part before the first '-'.
                if selector == 'title' and '-' in text:
                    text = text.split('-')[0].strip()
                text = self._clean_text(text)
                if text and text != self._UNKNOWN:
                    return text
            except Exception:
                # Best effort: a failing selector must not prevent the
                # remaining fallbacks from being tried.
                continue
        return self._UNKNOWN

    def _extract_field_value(self, field_text):
        """Return the value shown next to the table cell labelled *field_text*.

        The label cell is located with ``td:has-text(...)`` and the value is
        read from its next sibling element.

        Args:
            field_text (str): Label text to look for, e.g. ``"注册资本"``.

        Returns:
            str: The cleaned value, or ``"未知"`` when the label cell, its
            sibling, or a non-empty value is missing, or when an error occurs.
        """
        try:
            title_element = self.browser.page.query_selector(
                f'td:has-text("{field_text}")'
            )
            if title_element:
                sibling_handle = title_element.evaluate_handle(
                    'el => el.nextElementSibling'
                )
                # evaluate_handle() yields a handle even when the JS result is
                # null, so the handle's truthiness says nothing; as_element()
                # returns None unless the handle really wraps an element.
                value_element = sibling_handle.as_element() if sibling_handle else None
                if value_element:
                    text = self._normalize_field_value(
                        field_text, value_element.inner_text()
                    )
                    if text:
                        return text
        except Exception as e:
            print(f"提取字段 {field_text} 时出错: {e}")

        return self._UNKNOWN

    def _normalize_field_value(self, field_text, raw_text):
        """Clean the raw text of a value cell for the field *field_text*.

        Args:
            field_text (str): Label of the field the text belongs to.
            raw_text (str): Text as read from the value cell.

        Returns:
            str: The normalised value; may be empty.
        """
        # Collapse whitespace first: the suffix patterns below use '.*$',
        # which cannot cross newlines, so multi-line cell text would
        # otherwise slip through untouched.
        text = self._clean_text(raw_text)

        # Strip a leading "<label>：" prefix. Only do so when the prefix
        # actually contains the label, because real values (e.g. a business
        # scope like "一般项目：...") legitimately contain a full-width colon.
        prefix, colon, remainder = text.partition("：")
        if colon and field_text in prefix:
            text = remainder.strip()

        if field_text == "法定代表人":
            # Drop trailing extras such as "TA有12家企业".
            text = re.sub(r'\s*TA有\d+家企业.*$', '', text)
        elif field_text == "注册地址":
            # Drop trailing link texts such as "查看地图" / "附近企业".
            text = re.sub(r'\s*查看地图.*$', '', text)
            text = re.sub(r'\s*附近企业.*$', '', text)
        elif field_text == "参保人数":
            # Keep only the number, e.g. "7人" -> "7"; text without digits
            # is left unchanged.
            match = re.search(r'(\d+)', text)
            if match:
                text = match.group(1)

        return text.strip()

    def _clean_text(self, text):
        """Collapse every whitespace run to one space and trim the ends.

        Args:
            text (str): Raw text; ``None`` or an empty string is accepted.

        Returns:
            str: The cleaned text (``""`` for ``None`` or empty input).
        """
        if not text:
            return ""
        # '\s+' already covers '\r', '\n' and '\t', so one pass is enough.
        return re.sub(r'\s+', ' ', text).strip()

    def _extract_phone_number(self):
        """Return the first phone number shown on the page, or ``"未知"``.

        Returns:
            str: The cleaned phone text, or ``"未知"`` when the phone
            container or number element is missing, or when an error occurs.
        """
        try:
            phone_container = self.browser.page.query_selector(
                "div.business-info div.telphone-lists-wrap"
            )
            if phone_container:
                phone_element = phone_container.query_selector("span.copy-box span")
                if phone_element:
                    return self._clean_text(phone_element.inner_text())
        except Exception as e:
            # Report the failure like the other extractors do instead of
            # swallowing it silently; still fall back to the placeholder.
            print(f"提取电话号码时出错: {e}")
        return self._UNKNOWN