aiqicha
This commit is contained in:
2
.idea/SearchCompany.iml
generated
2
.idea/SearchCompany.iml
generated
@@ -4,7 +4,7 @@
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.13 (SearchCompany)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.13 virtualenv at D:\gitstudy\pythonwork\SearchCompany\.venv" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
1
.idea/misc.xml
generated
1
.idea/misc.xml
generated
@@ -3,4 +3,5 @@
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.13 (SearchCompany)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 virtualenv at D:\gitstudy\pythonwork\SearchCompany\.venv" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
1
company/auth_state.json
Normal file
1
company/auth_state.json
Normal file
@@ -0,0 +1 @@
|
||||
{}
|
||||
@@ -294,7 +294,7 @@ def check_company_exists(company_names, type_list):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
reader = CSVReader('data.csv')
|
||||
reader = CSVReader('data/data.csv')
|
||||
company_names = reader.read_column(0, has_header=False)
|
||||
print("所有数据:", company_names)
|
||||
|
||||
@@ -307,7 +307,7 @@ if __name__ == '__main__':
|
||||
with BingSearcher() as searcher:
|
||||
# 创建CSV工具实例
|
||||
csv_tool = CSVTool(
|
||||
csv_file_name='company_search_bing_data.csv',
|
||||
csv_file_name='data/company_search_bing_data.csv',
|
||||
headers=['title', 'url', 'web_site_type', 'request_url', 'company_name', 'create_time']
|
||||
)
|
||||
# 查询所有数据
|
||||
@@ -357,7 +357,7 @@ if __name__ == '__main__':
|
||||
else:
|
||||
# 创建CSV工具实例
|
||||
csv_tool = CSVTool(
|
||||
csv_file_name='company_search_bing_data.csv',
|
||||
csv_file_name='data/company_search_bing_data.csv',
|
||||
headers=['company_name','title', 'web_site_type','url', 'request_url', 'create_time']
|
||||
)
|
||||
|
||||
@@ -49,7 +49,30 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
||||
csv_file_name=output_csv,
|
||||
headers=output_headers
|
||||
)
|
||||
|
||||
|
||||
# 读取已有的企业名称,用于去重
|
||||
existing_company_names = set()
|
||||
try:
|
||||
existing_data = output_csv_tool.get_all_data()
|
||||
existing_company_names = {item['company_name'] for item in existing_data if item['company_name']}
|
||||
print(f"已存在 {len(existing_company_names)} 条企业数据")
|
||||
except FileNotFoundError:
|
||||
print(f"输出文件 {output_csv} 不存在,将创建新文件")
|
||||
except Exception as e:
|
||||
print(f"读取已有数据时出错: {e}")
|
||||
|
||||
# 过滤掉已存在的企业数据
|
||||
filtered_aiqicha_data = []
|
||||
for item in aiqicha_data:
|
||||
company_name = item.get('company_name', '')
|
||||
if company_name and company_name in existing_company_names:
|
||||
print(f"跳过已存在的企业: {company_name}")
|
||||
else:
|
||||
filtered_aiqicha_data.append(item)
|
||||
|
||||
aiqicha_data = filtered_aiqicha_data
|
||||
print(f'过滤后剩余 {len(aiqicha_data)} 条爱企查数据待处理')
|
||||
|
||||
# 使用爱企查详情爬虫
|
||||
with AiqichaDetailCrawler() as crawler:
|
||||
company_details = []
|
||||
@@ -101,10 +124,11 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
|
||||
time.sleep(next_sleep_interval)
|
||||
|
||||
print(f"总共成功处理并保存了 {success_count} 条企业详情数据到 {output_csv}")
|
||||
# crawler.browser.close_browser()
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 从原始搜索结果CSV中读取爱企查URL,爬取详情并保存到新CSV文件
|
||||
crawl_and_save_aiqicha_details('company_search_bing_data.csv', 'aiqicha_company_details.csv')
|
||||
crawl_and_save_aiqicha_details('data/company_search_bing_data.csv', 'data/aiqicha_company_details.csv')
|
||||
|
||||
# 原有代码保留
|
||||
# all_data = query_init_company_data('company_search_bing_data.csv')
|
||||
|
||||
47
data/data.csv
Normal file
47
data/data.csv
Normal file
@@ -0,0 +1,47 @@
|
||||
杭州元氪科技有限公司
|
||||
杭州华立创客社区管理有限公司
|
||||
杭州迅能贸易有限公司
|
||||
杭州栖梦工坊科技服务有限公司
|
||||
瑞幸咖啡(杭州)有限公司
|
||||
杭州余杭区五常街道莱欧烘焙店
|
||||
杭州余杭融昭文化创意工作室(个体工商户)
|
||||
杭州余杭区五常街道梓鑫水果店
|
||||
杭州三乐进出口有限公司
|
||||
杭州余杭区五常街道慧欣图文设计工作室
|
||||
杭州质享电子商务有限责任公司
|
||||
杰茜荟乳业(上海)有限公司杭州余杭科技分公司
|
||||
浙江正播影视文化传媒有限公司
|
||||
杭州品思电子商务有限公司
|
||||
杭州芯桥智联科技有限公司
|
||||
杭州帛蔻进出口有限公司
|
||||
杭州佰磊斯科技有限公司
|
||||
杭州括号传媒有限公司
|
||||
杭州赤骥贸易有限公司
|
||||
杭州键嘉医疗科技股份有限公司
|
||||
阳光财产保险股份有限公司杭州中心支公司
|
||||
杭州智风科技有限公司
|
||||
杭州勇达检测技术有限公司
|
||||
杭州初米网络技术有限公司
|
||||
杭州和辰电力科技有限公司
|
||||
杭州润州光电技术有限公司
|
||||
杭州君莱通信科技有限公司
|
||||
杭州余杭图王广告设计工作室(个体工商户)
|
||||
杭州润光软件技术有限公司
|
||||
杭州易光科技有限公司
|
||||
杭州奇课文化传媒有限公司
|
||||
杭州柯拉科技有限公司
|
||||
杭州备胎说车科技有限公司
|
||||
浙江绿色共享教育基金会
|
||||
杭州爱维因健康科技有限公司
|
||||
浙江润影医疗科技有限公司
|
||||
杭州花蜂科技有限公司
|
||||
连云港金康和信药业有限公司
|
||||
杭州立镖信息科技有限公司
|
||||
杭州番石榴供应链管理有限公司
|
||||
杭州昇辉生物技术有限公司
|
||||
湖州益荣服饰有限公司
|
||||
杭州顾嘉网络科技有限公司
|
||||
厨何以(杭州)电子商务有限公司
|
||||
杭州优冠商贸有限公司
|
||||
永馨智慧科技(杭州)有限公司
|
||||
杭州钱唐隆腾医疗技术有限公司
|
||||
|
@@ -1,7 +1,7 @@
|
||||
杭州辉煌物业管理有限公司
|
||||
杭州辉望科技有限公司
|
||||
浙江八方电信科技集团有限公司
|
||||
中国移动通信集团浙江有限公司余杭分公司西溪八方城(自营厅)
|
||||
中国移动通信集团浙江有限公司余杭分公司西溪八方城
|
||||
瑞凤九天(杭州)科技有限公司
|
||||
金码智能科技(杭州)有限公司
|
||||
OPPO广东移动通信有限公司
|
||||
@@ -15,7 +15,7 @@ OPPO广东移动通信有限公司
|
||||
杭州钱橙似锦科技有限公司
|
||||
杭州奥得徕贸易有限公司
|
||||
杭州伍壹荟旅游咨资询有限公司
|
||||
杭州心满意定供应链服务有限公司
|
||||
杭州心满意足供应链服务有限公司
|
||||
杭州麒晨科技有限公司
|
||||
杭州羊咩咩文化传媒有限公司
|
||||
杭州禾露则正生物科技有限公司
|
||||
@@ -30,7 +30,7 @@ OPPO广东移动通信有限公司
|
||||
杭州裕阳经营管理合伙企业(有限合伙)
|
||||
氧气.康复中心
|
||||
杭州云迹物联科技有限公司
|
||||
杭州着墨文化创意者限公司
|
||||
杭州着墨文化创意有限公司
|
||||
亚信科技(南京)有限公司
|
||||
杭州密尔沃智能装备有限公司
|
||||
杭州骏远电子商务有限公司
|
||||
@@ -41,7 +41,7 @@ OPPO广东移动通信有限公司
|
||||
杭州琑为缘文化艺术有限公司
|
||||
浙江丝里伯睡眠科技股份有限公司
|
||||
杭州倍驰科技有限公司
|
||||
杭州心灵部落教育有限公司(灵动生活)
|
||||
杭州心灵部落教育有限公司
|
||||
杭州云印智造科技有限公司
|
||||
浙江海拓环境技术有限公司
|
||||
申能环境科技有限公司
|
||||
@@ -50,7 +50,7 @@ OPPO广东移动通信有限公司
|
||||
杭州字节跳动科技有限公司
|
||||
杭州邻汇网络科技有限公司
|
||||
浙江建盛安全科技有限公司
|
||||
幻想集团·杭州运营中心
|
||||
石家庄幻想企业管理咨询有限公司
|
||||
杭州阿克莱斯设备有限公司
|
||||
浙江省现代农业促进会
|
||||
益思芯科技(杭州)有限公司
|
||||
|
1672
demo/aiqicha-company-detail.html
Normal file
1672
demo/aiqicha-company-detail.html
Normal file
File diff suppressed because one or more lines are too long
927
demo/aiqicha-home-logined.html
Normal file
927
demo/aiqicha-home-logined.html
Normal file
File diff suppressed because one or more lines are too long
1
demo/aiqicha-浙江千麦司法鉴定中心.html
Normal file
1
demo/aiqicha-浙江千麦司法鉴定中心.html
Normal file
@@ -0,0 +1 @@
|
||||
浙江千麦司法鉴定中心
|
||||
1262
demo/all_body.html
Normal file
1262
demo/all_body.html
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 466 KiB |
831
demo/qichacha_aiqicha.js
Normal file
831
demo/qichacha_aiqicha.js
Normal file
@@ -0,0 +1,831 @@
|
||||
// ==UserScript==
|
||||
// @name 爱企查&企查查等
|
||||
// @namespace http://tampermonkey.net/
|
||||
// @version 0.2
|
||||
// @description 在页面右下角添加工具按钮,支持复制源码和解析公司信息
|
||||
// @author You
|
||||
// @match https://www.qcc.com/firm/*
|
||||
// @match https://aiqicha.baidu.com/company_detail_*
|
||||
// @grant none
|
||||
// ==/UserScript==
|
||||
|
||||
(function () {
|
||||
"use strict";
|
||||
|
||||
// 工具类 - 存放通用函数
|
||||
/**
 * Stateless helpers shared by the site parsers and the floating tool panel:
 * text normalisation, selector lookup, clipboard access and small UI popups.
 */
class ToolUtils {
  /**
   * Collapse every run of whitespace into one space and trim both ends.
   *
   * @param {?string} text Raw text, typically an element's textContent.
   * @returns {string} Normalised text; "" for null/undefined input.
   */
  static cleanText(text) {
    if (text == null) return "";
    // `\s` already matches \r, \n and \t, so one pass is sufficient.
    return String(text).replace(/\s+/g, " ").trim();
  }

  /**
   * Return the trimmed text of the first selector that matches an element
   * with non-blank text.
   *
   * @param {ParentNode} doc Root to query.
   * @param {string[]} selectors CSS selectors, tried in order.
   * @returns {string} Trimmed text, or "" when nothing matches.
   */
  static extractText(doc, selectors) {
    for (const selector of selectors) {
      const element = doc.querySelector(selector);
      const text = element ? element.textContent.trim() : "";
      if (text) {
        return text;
      }
    }
    return "";
  }

  /**
   * Copy text to the clipboard through a temporary off-screen textarea.
   *
   * @param {string} content Text to copy.
   * @param {string} [successMessage] Toast text shown when the copy succeeds.
   * @returns {boolean} Whether the browser reported a successful copy.
   */
  static copyToClipboard(content, successMessage) {
    const textarea = document.createElement("textarea");
    textarea.value = content;
    Object.assign(textarea.style, {
      position: "fixed",
      top: "0",
      left: "0",
      width: "1px",
      height: "1px",
      opacity: "0",
    });
    document.body.appendChild(textarea);

    let copied = false;
    try {
      textarea.select();
      // execCommand reports failure through its return value.
      copied = document.execCommand("copy");
    } catch (err) {
      console.error("复制失败:", err);
    } finally {
      // Always remove the helper element, even when copying throws.
      document.body.removeChild(textarea);
    }

    if (!copied) {
      ToolUtils.showAutoCloseMessage("复制失败", "error");
    } else if (successMessage) {
      ToolUtils.showAutoCloseMessage(successMessage, "success");
    }
    return copied;
  }

  /**
   * Show a centred toast that fades out and removes itself after 2 seconds.
   * Used instead of the blocking alert().
   *
   * @param {string} message Text to display.
   * @param {string} [type="info"] "success", "error", or anything else for info.
   */
  static showAutoCloseMessage(message, type = "info") {
    const palette = { success: "#52c41a", error: "#f5222d" };

    const alertBox = document.createElement("div");
    alertBox.textContent = message;
    Object.assign(alertBox.style, {
      position: "fixed",
      top: "50%",
      left: "50%",
      transform: "translate(-50%, -50%)",
      backgroundColor: palette[type] || "#1890ff",
      color: "white",
      padding: "10px 20px",
      borderRadius: "4px",
      zIndex: "10001",
      boxShadow: "0 2px 8px rgba(0,0,0,0.15)",
      transition: "opacity 0.3s",
    });
    document.body.appendChild(alertBox);

    setTimeout(() => {
      if (!document.body.contains(alertBox)) return;
      // Fade out first, then detach once the 0.3s transition has finished.
      alertBox.style.opacity = "0";
      setTimeout(() => {
        if (document.body.contains(alertBox)) {
          document.body.removeChild(alertBox);
        }
      }, 300);
    }, 2000);
  }

  /**
   * Show the parsed company data as pretty-printed JSON in a modal with
   * "copy JSON" and "close" buttons.
   *
   * @param {Object} data Parsed company fields.
   */
  static showResult(data) {
    // Serialise once so the copied text always equals the displayed text.
    const json = JSON.stringify(data, null, 2);

    const modal = document.createElement("div");
    Object.assign(modal.style, {
      position: "fixed",
      top: "50%",
      left: "50%",
      transform: "translate(-50%, -50%)",
      width: "600px",
      maxHeight: "80vh",
      overflowY: "auto",
      backgroundColor: "white",
      padding: "20px",
      boxShadow: "0 0 10px rgba(0,0,0,0.3)",
      zIndex: "10000",
    });

    const pre = document.createElement("pre");
    pre.textContent = json;
    pre.style.whiteSpace = "pre-wrap";
    pre.style.wordWrap = "break-word";

    // Both buttons share everything except label, colour and left margin.
    const makeButton = (label, background, extraStyle = {}) => {
      const btn = document.createElement("button");
      btn.textContent = label;
      Object.assign(
        btn.style,
        {
          marginTop: "10px",
          padding: "8px 16px",
          backgroundColor: background,
          color: "white",
          border: "none",
          borderRadius: "4px",
          cursor: "pointer",
        },
        extraStyle
      );
      return btn;
    };

    const copyBtn = makeButton("复制JSON", "#52c41a");
    copyBtn.addEventListener("click", () => {
      navigator.clipboard
        .writeText(json)
        .then(() => ToolUtils.showAutoCloseMessage("已复制到剪贴板", "success"))
        .catch((err) => alert("复制失败: " + err));
    });

    const closeBtn = makeButton("关闭", "#f5222d", { marginLeft: "10px" });
    closeBtn.addEventListener("click", () => {
      document.body.removeChild(modal);
    });

    modal.innerHTML = '<h2 style="margin-top: 0;">企业信息解析结果</h2>';
    modal.appendChild(pre);
    modal.appendChild(document.createElement("br"));
    modal.appendChild(copyBtn);
    modal.appendChild(closeBtn);

    document.body.appendChild(modal);
  }
}
|
||||
|
||||
// 爱企查解析类
|
||||
/**
 * Extracts company registration fields from an aiqicha.baidu.com detail page.
 *
 * All table getters read `this.table`, so `initTable()` must succeed first;
 * `parseCompanyInfo()` takes care of that ordering.
 */
class AiQiChaParser {
  constructor() {
    // Basic-info table element; set by initTable().
    this.table = null;
  }

  /**
   * Locate the basic-info table and cache it on the instance.
   *
   * @returns {boolean} false (after alerting the user) when the table is absent.
   */
  initTable() {
    this.table = document.querySelector("table.zx-detail-basic-table");
    if (!this.table) {
      alert("未找到企业信息表格");
      return false;
    }
    return true;
  }

  /** Snapshot of every <td> in the cached table. */
  _cells() {
    return Array.from(this.table.querySelectorAll("td"));
  }

  /**
   * Read a value cell, preferring its non-empty `.enter-bg-ele` child over
   * the text of the whole cell.
   */
  _readValue(valueCell) {
    const rawValue =
      valueCell.querySelector(".enter-bg-ele")?.textContent ||
      valueCell.textContent;
    return ToolUtils.cleanText(rawValue);
  }

  /** Pull the first "<digits>人" token out of a cell, or null if none. */
  _readHeadcount(valueCell) {
    const rawText = valueCell.textContent.replace(/[\r\n\t]/g, "").trim();
    const match = rawText.match(/(\d+人)/);
    return match ? match[0] : null;
  }

  /**
   * Read the phone number from the business-info header area.
   *
   * @returns {string} The number, or a "not found" placeholder string.
   */
  getPhoneNumber() {
    const phoneContainer = document.querySelector(
      "div.business-info div.telphone-lists-wrap"
    );
    if (!phoneContainer) return "未找到电话信息";

    const phoneElement = phoneContainer.querySelector("span.copy-box span");
    if (!phoneElement) return "未找到电话号码";

    return ToolUtils.cleanText(phoneElement.textContent);
  }

  /**
   * Generic lookup: find the cell whose cleaned text equals `title` and
   * return the cleaned text of the cell that follows it.
   *
   * @param {string} title Exact label text.
   * @returns {?string} Value text, or null when label or value cell is missing.
   */
  getOptimizedValue(title) {
    const titleCell = this._cells().find(
      (cell) => ToolUtils.cleanText(cell.textContent) === title
    );
    if (!titleCell) return null;

    const valueCell = titleCell.nextElementSibling;
    if (!valueCell) return null;

    // A matching child element wins even when its own text is empty.
    const valueElement =
      valueCell.querySelector(".enter-bg-ele") ||
      valueCell.querySelector(".addr-enter-bg-ele") ||
      valueCell;

    return ToolUtils.cleanText(valueElement.textContent);
  }

  /**
   * Legal representative: prefer the linked person name inside an
   * `image-text-content` cell; otherwise fall back to the text of the cell
   * after any label that contains "法定代表人".
   */
  getLegalRepresentative() {
    const cells = this._cells();

    const exactLabel = cells.find(
      (td) => ToolUtils.cleanText(td.textContent) === "法定代表人"
    );
    if (exactLabel) {
      const valueCell = exactLabel.nextElementSibling;
      if (valueCell && valueCell.classList.contains("image-text-content")) {
        const nameElement = valueCell.querySelector(".person-name-warp a");
        return ToolUtils.cleanText((nameElement || valueCell).textContent);
      }
    }

    const looseLabel = cells.find((td) =>
      td.textContent.includes("法定代表人")
    );
    if (looseLabel && looseLabel.nextElementSibling) {
      return ToolUtils.cleanText(looseLabel.nextElementSibling.textContent);
    }

    return null;
  }

  /**
   * Unified social credit code; falls back to the taxpayer-id row when the
   * dedicated row (value cell classed `table-regCapital-lable`) is missing.
   */
  getUnifiedSocialCreditCode() {
    const cells = this._cells();

    const codeLabel = cells.find(
      (td) =>
        td.textContent.includes("统一社会信用代码") &&
        td.nextElementSibling &&
        td.nextElementSibling.classList.contains("table-regCapital-lable")
    );
    if (codeLabel) {
      return this._readValue(codeLabel.nextElementSibling);
    }

    const taxLabel = cells.find((td) =>
      td.textContent.includes("纳税人识别号")
    );
    if (taxLabel && taxLabel.nextElementSibling) {
      return this._readValue(taxLabel.nextElementSibling);
    }

    return null;
  }

  /** Business registration number, or null when the row is missing. */
  getBusinessRegistrationNo() {
    const label = this._cells().find((td) =>
      ToolUtils.cleanText(td.textContent).includes("工商注册号")
    );
    if (label && label.nextElementSibling) {
      return this._readValue(label.nextElementSibling);
    }
    return null;
  }

  /**
   * Organization code: try the tooltip-wrapped label first (value cell must
   * carry class `enter-bg`), then an exact-text label match.
   */
  getOrganizationCode() {
    const tipLabel = Array.from(
      this.table.querySelectorAll(".poptip-wrap-org-no")
    ).find((el) => el.textContent.includes("组织机构代码"));

    if (tipLabel) {
      // closest() returns null if the tooltip is not inside a <td>.
      const valueCell = tipLabel.closest("td")?.nextElementSibling;
      if (valueCell && valueCell.classList.contains("enter-bg")) {
        return this._readValue(valueCell);
      }
    }

    const exactLabel = this._cells().find(
      (td) => ToolUtils.cleanText(td.textContent) === "组织机构代码"
    );
    if (exactLabel && exactLabel.nextElementSibling) {
      return this._readValue(exactLabel.nextElementSibling);
    }

    return null;
  }

  /**
   * Taxpayer id; falls back to the unified-social-credit-code row when no
   * taxpayer-id row exists.
   */
  getTaxpayerId() {
    const cells = this._cells();

    for (const labelText of ["纳税人识别号", "统一社会信用代码"]) {
      const label = cells.find((td) =>
        ToolUtils.cleanText(td.textContent).includes(labelText)
      );
      if (label && label.nextElementSibling) {
        return this._readValue(label.nextElementSibling);
      }
    }

    return null;
  }

  /**
   * Insured headcount ("<n>人"). Primary lookup is the label cell containing
   * an `.insurance-info` element; the fallback reads the cell immediately
   * before the "登记机关" label.
   */
  getInsuranceNumber() {
    const cells = this._cells();

    const insuranceLabel = cells.find(
      (td) =>
        td.textContent.includes("参保人数") &&
        td.querySelector(".insurance-info")
    );
    if (insuranceLabel) {
      const valueCell = insuranceLabel.nextElementSibling;
      return valueCell ? this._readHeadcount(valueCell) : null;
    }

    const registrarLabel = cells.find((td) =>
      td.textContent.includes("登记机关")
    );
    if (registrarLabel && registrarLabel.previousElementSibling) {
      return this._readHeadcount(registrarLabel.previousElementSibling);
    }

    return null;
  }

  /**
   * Approval date. Only values shaped exactly like YYYY-MM-DD are accepted;
   * anything else yields null.
   */
  getApprovalDate() {
    const isoDate = /^\d{4}-\d{2}-\d{2}$/;

    const tipLabel = Array.from(
      this.table.querySelectorAll(".poptip-wrap-annual-date")
    ).find((el) => el.textContent.includes("核准日期"));

    if (tipLabel) {
      // closest() returns null if the tooltip is not inside a <td>.
      const valueCell = tipLabel.closest("td")?.nextElementSibling;
      if (valueCell) {
        const rawValue = valueCell.textContent
          .replace(/[\r\n\t]/g, "")
          .trim();
        if (isoDate.test(rawValue)) {
          return rawValue;
        }
      }
    }

    const exactLabel = this._cells().find(
      (td) => ToolUtils.cleanText(td.textContent) === "核准日期"
    );
    if (exactLabel && exactLabel.nextElementSibling) {
      const rawValue = ToolUtils.cleanText(
        exactLabel.nextElementSibling.textContent
      );
      if (isoDate.test(rawValue)) {
        return rawValue;
      }
    }

    return null;
  }

  /**
   * Entry point: collect every supported field and show the result modal.
   * Does nothing beyond the initTable() alert when the table is missing.
   */
  parseCompanyInfo() {
    if (!this.initTable()) return;

    const companyData = {
      企业名称: this.getOptimizedValue("企业名称"),
      统一社会信用代码: this.getUnifiedSocialCreditCode(),
      法定代表人: this.getLegalRepresentative(),
      电话: this.getPhoneNumber(),
      经营状态: this.getOptimizedValue("经营状态"),
      成立日期: this.getOptimizedValue("成立日期"),
      行政区划: this.getOptimizedValue("行政区划"),
      注册资本: this.getOptimizedValue("注册资本"),
      实缴资本: this.getOptimizedValue("实缴资本"),
      企业类型: this.getOptimizedValue("企业类型"),
      所属行业: this.getOptimizedValue("所属行业"),
      工商注册号: this.getBusinessRegistrationNo(),
      组织机构代码: this.getOrganizationCode(),
      纳税人识别号: this.getTaxpayerId(),
      纳税人资质: this.getOptimizedValue("纳税人资质"),
      营业期限: this.getOptimizedValue("营业期限"),
      核准日期: this.getApprovalDate(),
      参保人数: this.getInsuranceNumber(),
      登记机关: this.getOptimizedValue("登记机关"),
      曾用名: this.getOptimizedValue("曾用名"),
      注册地址: this.getOptimizedValue("注册地址"),
      经营范围: this.getOptimizedValue("经营范围"),
    };

    ToolUtils.showResult(companyData);
  }
}
|
||||
|
||||
// QCC解析类 企查查
|
||||
/**
 * Extracts company registration fields from a qcc.com firm page.
 *
 * Table getters read `this.table`, which initTable() populates;
 * parseCompanyInfo() is the entry point and enforces that order.
 */
class QCCParser {
  constructor() {
    // Company info table element; set by initTable().
    this.table = null;
  }

  /**
   * Find `table.ntable` inside `div.cominfo-normal` and cache it.
   *
   * @returns {boolean} false (after an alert) when container or table is missing.
   */
  initTable() {
    const infoBox = document.querySelector("div.cominfo-normal");
    if (!infoBox) {
      alert("未找到企业信息容器");
      return false;
    }

    this.table = infoBox.querySelector("table.ntable");
    if (this.table) {
      return true;
    }
    alert("未找到企业信息表格");
    return false;
  }

  /**
   * Look up the value next to every `td.tb` header whose cleaned text
   * contains `title`. When several headers match, the last one that has a
   * following cell wins.
   *
   * @param {string} title Substring to look for in the header text.
   * @returns {?string} Cleaned value text, or null when nothing matched.
   */
  getOptimizedValue(title) {
    let found = null;

    for (const headerCell of Array.from(
      this.table.querySelectorAll("td.tb")
    )) {
      if (!ToolUtils.cleanText(headerCell.textContent).includes(title)) {
        continue;
      }
      const sibling = headerCell.nextElementSibling;
      if (!sibling) {
        continue;
      }
      // Prefer the dedicated .copy-value node over the whole cell's text.
      const copyNode = sibling.querySelector(".copy-value");
      found = ToolUtils.cleanText((copyNode || sibling).textContent);
    }

    return found;
  }

  /**
   * Legal representative name with any trailing "关联企业 N" badge removed.
   * Tries the generic lookup first, then progressively looser strategies
   * inside the value cell.
   */
  getLegalRepresentative() {
    const stripBadge = (text) => text.replace(/\s*关联企业\s*\d+$/, "").trim();

    const generic =
      this.getOptimizedValue("法定代表人") || this.getOptimizedValue("法人");
    if (generic && generic.trim()) {
      return stripBadge(generic);
    }

    const labelCell = Array.from(this.table.querySelectorAll("td.tb")).find(
      (cell) => ToolUtils.cleanText(cell.textContent).includes("法定代表人")
    );
    if (!labelCell) return null;

    const cell = labelCell.nextElementSibling;
    if (!cell) return null;

    // Links opening in a new tab: take the first whose text is non-empty
    // and is not the "关联企业" / "复制" helper text.
    for (const anchor of cell.querySelectorAll('a[target="_blank"]')) {
      const candidate = ToolUtils.cleanText(anchor.textContent);
      const isHelperText =
        candidate.includes("关联企业") || candidate.includes("复制");
      if (candidate && !isHelperText) {
        return candidate;
      }
    }

    // Otherwise the first link of any kind, then .copy-value, then the cell.
    const anyAnchor = cell.querySelector("a");
    if (anyAnchor) {
      return stripBadge(ToolUtils.cleanText(anyAnchor.textContent));
    }

    const copyNode = cell.querySelector(".copy-value");
    if (copyNode) {
      return stripBadge(ToolUtils.cleanText(copyNode.textContent));
    }

    return stripBadge(ToolUtils.cleanText(cell.textContent));
  }

  /** Unified social credit code, accepting the shorter "信用代码" label too. */
  getUnifiedSocialCreditCode() {
    const full = this.getOptimizedValue("统一社会信用代码");
    return full || this.getOptimizedValue("信用代码");
  }

  /** Business registration number, accepting the shorter "注册号" label too. */
  getBusinessRegistrationNo() {
    const full = this.getOptimizedValue("工商注册号");
    return full || this.getOptimizedValue("注册号");
  }

  /** Organization code. */
  getOrganizationCode() {
    return this.getOptimizedValue("组织机构代码");
  }

  /** Taxpayer id; falls back to the unified social credit code. */
  getTaxpayerId() {
    const taxId = this.getOptimizedValue("纳税人识别号");
    return taxId || this.getUnifiedSocialCreditCode();
  }

  /**
   * Insured headcount formatted as "<n>人 <report link text>".
   *
   * @returns {?string} null when the header, value cell or count is missing.
   */
  getInsuranceNumber() {
    const labelCell = Array.from(this.table.querySelectorAll("td.tb")).find(
      (cell) => ToolUtils.cleanText(cell.textContent).includes("参保人数")
    );
    if (!labelCell) return null;

    const cell = labelCell.nextElementSibling;
    if (!cell) return null;

    // The count is read from the first <span>, the report text from the link.
    const countNode = cell.querySelector("span");
    const count = countNode ? ToolUtils.cleanText(countNode.textContent) : null;

    const yearNode = cell.querySelector("a.m-l-r-10");
    const year = yearNode ? ToolUtils.cleanText(yearNode.textContent) : "";

    if (!count) {
      return null;
    }
    return `${count}人 ${year}`;
  }

  /**
   * Contact phone from the right-hand part of the contact-info box.
   *
   * @returns {?string} null when any step of the lookup fails.
   */
  getPhoneNumber() {
    const contactBox = document.querySelector("div.contact-info");
    if (!contactBox) return null;

    const rightSide = contactBox.querySelector("div.main-part-item.right");
    if (!rightSide) return null;

    // The row is identified by its "电话:" label text.
    const phoneRow = Array.from(rightSide.querySelectorAll("div.rline")).find(
      (row) => ToolUtils.cleanText(row.textContent).includes("电话:")
    );
    if (!phoneRow) return null;

    // Skip any copyable span that is itself the label.
    const numberSpan = Array.from(
      phoneRow.querySelectorAll("span.need-copy-field")
    ).find((span) => !ToolUtils.cleanText(span.textContent).includes("电话:"));

    if (!numberSpan) {
      return null;
    }
    return ToolUtils.cleanText(numberSpan.textContent);
  }

  /** Approval date; falls back to the establishment date row. */
  getApprovalDate() {
    const approved = this.getOptimizedValue("核准日期");
    return approved || this.getOptimizedValue("成立日期");
  }

  /**
   * Entry point: collect every supported field and show the result modal.
   * Does nothing beyond the initTable() alert when the table is missing.
   */
  parseCompanyInfo() {
    const ready = this.initTable();
    if (!ready) return;

    const info = {
      企业名称:
        this.getOptimizedValue("企业名称") ||
        this.getOptimizedValue("公司名称"),
      统一社会信用代码: this.getUnifiedSocialCreditCode(),
      法定代表人: this.getLegalRepresentative(),
      经营状态: this.getOptimizedValue("登记状态"),
      成立日期: this.getOptimizedValue("成立日期"),
      行政区划: this.getOptimizedValue("行政区划"),
      注册资本: this.getOptimizedValue("注册资本"),
      实缴资本: this.getOptimizedValue("实缴资本"),
      企业类型: this.getOptimizedValue("企业类型"),
      所属行业: this.getOptimizedValue("国标行业"),
      工商注册号: this.getBusinessRegistrationNo(),
      组织机构代码: this.getOrganizationCode(),
      纳税人识别号: this.getTaxpayerId(),
      纳税人资质: this.getOptimizedValue("纳税人资质"),
      营业期限: this.getOptimizedValue("营业期限"),
      核准日期: this.getApprovalDate(),
      参保人数: this.getInsuranceNumber(),
      电话: this.getPhoneNumber(),
      登记机关: this.getOptimizedValue("登记机关"),
      曾用名: this.getOptimizedValue("曾用名"),
      注册地址: this.getOptimizedValue("注册地址"),
      经营范围: this.getOptimizedValue("经营范围"),
    };

    ToolUtils.showResult(info);
  }
}
|
||||
|
||||
// 创建按钮容器
|
||||
/**
 * Build the floating tool panel and attach it to the page.
 *
 * The panel starts as a 40px green "+" circle in the bottom-right corner,
 * expands on hover to reveal two buttons (copy page source, parse company
 * info), and can be dragged by its "+" sign or its own background.
 */
function createButtonContainer() {
  const container = document.createElement("div");
  container.id = "tool-container";
  Object.assign(container.style, {
    position: "fixed",
    right: "20px",
    bottom: "20px",
    zIndex: "9999",
    display: "flex",
    flexDirection: "column",
    gap: "10px",
    width: "40px",
    height: "40px",
    backgroundColor: "#4CAF50",
    borderRadius: "50%",
    transition: "all 0.3s ease",
    overflow: "hidden",
    cursor: "move",
  });

  // "+" indicator shown while the panel is collapsed.
  const plusSign = document.createElement("div");
  plusSign.textContent = "+";
  Object.assign(plusSign.style, {
    color: "white",
    fontSize: "24px",
    textAlign: "center",
    lineHeight: "40px",
    width: "100%",
  });
  container.appendChild(plusSign);

  // Expand on hover, collapse back to the circle on leave.
  container.addEventListener("mouseenter", () => {
    container.style.width = "150px";
    container.style.height = "auto";
    container.style.borderRadius = "8px";
  });

  container.addEventListener("mouseleave", () => {
    container.style.width = "40px";
    container.style.height = "40px";
    container.style.borderRadius = "50%";
  });

  // Drag state: pointer offset inside the panel at mousedown time.
  let isDragging = false;
  let offsetX, offsetY;

  container.addEventListener("mousedown", (e) => {
    // Dragging starts only from the "+" sign or the bare container, so a
    // press on one of the buttons still produces a normal click.
    if (e.target === plusSign || e.target === container) {
      isDragging = true;
      const rect = container.getBoundingClientRect();
      offsetX = e.clientX - rect.left;
      offsetY = e.clientY - rect.top;
      container.style.cursor = "grabbing";
      e.stopPropagation();
      e.preventDefault();
    }
  });

  // Listen on document so the drag continues when the pointer leaves the panel.
  document.addEventListener("mousemove", (e) => {
    if (!isDragging) return;
    container.style.left = e.clientX - offsetX + "px";
    container.style.top = e.clientY - offsetY + "px";
    // Switch from right/bottom anchoring to the explicit left/top position.
    container.style.right = "auto";
    container.style.bottom = "auto";
  });

  document.addEventListener("mouseup", () => {
    if (isDragging) {
      isDragging = false;
      container.style.cursor = "move";
    }
  });

  /**
   * Create one full-width panel button with a hover highlight.
   *
   * @param {string} text Button label.
   * @param {Function} onClick Click handler.
   * @returns {HTMLButtonElement}
   */
  function createButton(text, onClick) {
    const button = document.createElement("button");
    button.textContent = text;
    Object.assign(button.style, {
      padding: "8px 12px",
      border: "none",
      borderRadius: "4px",
      backgroundColor: "white",
      color: "#333",
      cursor: "pointer",
      width: "100%",
      // Transition values take the CSS property name (hyphenated), not the
      // camelCase JS style key; "backgroundColor" here disabled the fade.
      transition: "background-color 0.2s",
    });
    button.addEventListener(
      "mouseenter",
      () => (button.style.backgroundColor = "#f0f0f0")
    );
    button.addEventListener(
      "mouseleave",
      () => (button.style.backgroundColor = "white")
    );
    button.addEventListener("click", onClick);
    return button;
  }

  // Copy the full page HTML to the clipboard.
  const copySourceButton = createButton("复制源码", () => {
    const html = document.documentElement.outerHTML;
    // copyToClipboard is a static method of ToolUtils; calling it as a bare
    // function raised a ReferenceError on every click.
    ToolUtils.copyToClipboard(html, "HTML源码已复制到剪贴板");
  });

  // Parse company info with the parser that matches the current host.
  const parseInfoButton = createButton("解析公司信息", () => {
    const host = window.location.host;
    let parser;
    if (host.includes("aiqicha.baidu.com")) {
      parser = new AiQiChaParser();
    } else if (host.includes("qcc.com")) {
      parser = new QCCParser();
    } else {
      alert("不支持的网站");
      return;
    }
    parser.parseCompanyInfo();
  });

  container.appendChild(copySourceButton);
  container.appendChild(parseInfoButton);

  document.body.appendChild(container);
}
|
||||
|
||||
// 页面加载完成后创建按钮
|
||||
window.addEventListener("load", createButtonContainer);
|
||||
})();
|
||||
24
parse_img/paddle_ocr_test.py
Normal file
24
parse_img/paddle_ocr_test.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from paddleocr import PaddleOCR
|
||||
|
||||
# Build the OCR pipeline for Chinese text with every optional
# pre-processing stage switched off.
#
# Alternative configurations kept for reference:
#   PaddleOCR(lang="en")               -> use the English model
#   PaddleOCR(ocr_version="PP-OCRv4")  -> use another PP-OCR version
#   PaddleOCR(device="gpu")            -> run model inference on the GPU
#   PaddleOCR(
#       text_detection_model_name="PP-OCRv5_server_det",
#       text_recognition_model_name="PP-OCRv5_server_rec",
#       use_doc_orientation_classify=False,
#       use_doc_unwarping=False,
#       use_textline_orientation=False,
#   )                                  -> switch to the PP-OCRv5_server models
ocr = PaddleOCR(
    lang="ch",
    # use_angle_cls=True,  (left disabled)
    use_textline_orientation=False,  # no text-line orientation classification model
    use_doc_unwarping=False,  # no text-image unwarping model
    use_doc_orientation_classify=False,  # no document orientation classification model
)

# Run recognition on a single sample page image.
result = ocr.predict(
    "D:/gitstudy/pythonwork/manchuspider/data/满洲语字典/满汉大辞典/images/1.png"
)

# For every returned result: print it, then save it as an image and as
# JSON under "output".
for res in result:
    res.print()
    res.save_to_img("output")
    res.save_to_json("output")
|
||||
@@ -28,11 +28,11 @@ class AiqichaDetailCrawler:
|
||||
print("已加载Cookie")
|
||||
|
||||
# 使用登录管理器检测登录状态
|
||||
logined = self.login_manager.check_and_login()
|
||||
if logined:
|
||||
print("登录成功")
|
||||
else:
|
||||
print("登录失败")
|
||||
# logined = self.login_manager.check_and_login()
|
||||
# if logined:
|
||||
# print("登录成功")
|
||||
# else:
|
||||
# print("登录失败")
|
||||
self.browser_started = True
|
||||
except Exception as e:
|
||||
print(f"启动浏览器失败: {e}")
|
||||
@@ -85,15 +85,18 @@ class AiqichaDetailCrawler:
|
||||
|
||||
# 额外等待一段时间确保页面完全加载
|
||||
import time
|
||||
time.sleep(2)
|
||||
time.sleep(10)
|
||||
print("额外等待完成,页面应该已完全加载")
|
||||
except Exception as e:
|
||||
print(f"等待页面元素时出错: {e}")
|
||||
print("继续尝试解析页面内容...")
|
||||
|
||||
|
||||
self.browser.save_cookies()
|
||||
|
||||
# 提取基本信息
|
||||
print("开始解析页面信息...")
|
||||
parser = AiqichaDetailParser(self.browser.page)
|
||||
parser = AiqichaDetailParser(self.browser)
|
||||
company_info = parser.parse_company_info()
|
||||
|
||||
print(f"成功爬取企业信息: {company_info['name']}")
|
||||
@@ -101,7 +104,6 @@ class AiqichaDetailCrawler:
|
||||
else:
|
||||
print("访问页面失败")
|
||||
return {}
|
||||
|
||||
except Exception as e:
|
||||
print(f"爬取过程中出现错误: {e}")
|
||||
return {}
|
||||
|
||||
@@ -6,14 +6,15 @@ import re
|
||||
class AiqichaDetailParser:
|
||||
"""爱企查企业详情页解析器"""
|
||||
|
||||
def __init__(self, page):
|
||||
def __init__(self, browser):
|
||||
"""
|
||||
初始化解析器
|
||||
|
||||
Args:
|
||||
page: 浏览器页面对象
|
||||
browser: 浏览器页面对象
|
||||
"""
|
||||
self.page = page
|
||||
self.browser = browser
|
||||
# self.self.browser
|
||||
|
||||
def parse_company_info(self):
|
||||
"""
|
||||
@@ -48,62 +49,98 @@ class AiqichaDetailParser:
|
||||
}
|
||||
|
||||
# 批量提取信息
|
||||
for field, selectors in fields.items():
|
||||
company_info[field] = self._extract_field_value(selectors)
|
||||
# 爱企查页面使用表格结构,需要特殊处理
|
||||
field_mapping = {
|
||||
'legal_representative': '法定代表人',
|
||||
'business_scope': '经营范围',
|
||||
'credit_code': '统一社会信用代码',
|
||||
'registered_capital': '注册资本',
|
||||
'establishment_date': '成立日期',
|
||||
'business_status': '经营状态',
|
||||
'company_type': '企业类型',
|
||||
'registration_authority': '登记机关',
|
||||
'operating_period': '营业期限',
|
||||
'address': '注册地址'
|
||||
}
|
||||
for field_name, field_text in field_mapping.items():
|
||||
company_info[field_name] = self._extract_field_value(field_text)
|
||||
|
||||
# 特殊处理电话号码
|
||||
company_info['phone'] = self._extract_phone_number()
|
||||
|
||||
company_info['name'] = self._extract_company_name()
|
||||
return company_info
|
||||
|
||||
def _extract_field_value(self, selectors):
|
||||
def _extract_company_name(self):
|
||||
"""
|
||||
根据多个选择器提取字段值
|
||||
|
||||
Args:
|
||||
selectors (list): CSS选择器列表
|
||||
提取企业名称
|
||||
|
||||
Returns:
|
||||
str: 提取到的值或"未知"
|
||||
str: 企业名称或"未知"
|
||||
"""
|
||||
for selector in selectors:
|
||||
try:
|
||||
# 添加日志:显示当前尝试的选择器
|
||||
print(f"尝试选择器: {selector}")
|
||||
try:
|
||||
# 尝试多种方式获取企业名称
|
||||
selectors = [
|
||||
'title', # 页面标题
|
||||
'.company-name', # 常见的公司名称类
|
||||
'h1.enterprise-name', # 企业名称标题
|
||||
'.company-title' # 其他可能的类名
|
||||
]
|
||||
|
||||
# 尝试查找带有 enter-bg-ele 类的元素
|
||||
element = self.page.query_selector(f"{selector} .enter-bg-ele")
|
||||
if element:
|
||||
print(f"找到 enter-bg-ele 元素,选择器: {selector} .enter-bg-ele")
|
||||
else:
|
||||
# 尝试查找带有 addr-enter-bg-ele 类的元素
|
||||
element = self.page.query_selector(f"{selector} .addr-enter-bg-ele")
|
||||
for selector in selectors:
|
||||
try:
|
||||
element = self.browser.page.query_selector(selector)
|
||||
if element:
|
||||
print(f"找到 addr-enter-bg-ele 元素,选择器: {selector} .addr-enter-bg-ele")
|
||||
else:
|
||||
# 直接查找元素
|
||||
element = self.page.query_selector(selector)
|
||||
if element:
|
||||
print(f"找到直接元素,选择器: {selector}")
|
||||
text = element.inner_text().strip()
|
||||
# 如果是标题,可能需要去除后缀
|
||||
if selector == 'title' and '-' in text:
|
||||
text = text.split('-')[0].strip()
|
||||
text = self._clean_text(text)
|
||||
if text and text != "未知":
|
||||
return text
|
||||
except:
|
||||
continue
|
||||
return "未知"
|
||||
except Exception as e:
|
||||
print(f"提取企业名称时出错: {e}")
|
||||
return "未知"
|
||||
|
||||
def _extract_field_value(self, field_text):
|
||||
"""
|
||||
根据多个选择器提取字段值,适配爱企查实际页面结构
|
||||
"""
|
||||
|
||||
|
||||
# for field_name, field_text in field_mapping.items():
|
||||
try:
|
||||
# 查找包含特定文本的td元素
|
||||
title_element = self.browser.page.query_selector(f'td:has-text("{field_text}")')
|
||||
if title_element:
|
||||
# 获取相邻的td元素(包含实际值)
|
||||
value_element = title_element.evaluate_handle('el => el.nextElementSibling')
|
||||
if value_element:
|
||||
text = value_element.inner_text().strip()
|
||||
# 清理文本,移除前缀
|
||||
if ":" in text:
|
||||
text = text.split(":", 1)[1].strip()
|
||||
|
||||
# 特殊处理法定代表人字段,去除"TA有X家企业"等额外信息
|
||||
if field_text == "法定代表人":
|
||||
# 移除类似"TA有12家企业"的额外信息
|
||||
text = re.sub(r'\s*TA有\d+家企业.*$', '', text)
|
||||
|
||||
# 特殊处理地址字段,去除"查看地图"等额外信息
|
||||
if field_text == "注册地址":
|
||||
# 移除"查看地图"等额外信息
|
||||
text = re.sub(r'\s*查看地图.*$', '', text)
|
||||
text = re.sub(r'\s*附近企业.*$', '', text)
|
||||
|
||||
if element:
|
||||
text = element.inner_text().strip()
|
||||
print(f"提取到原始文本: '{text}'")
|
||||
# 清理文本内容
|
||||
text = self._clean_text(text)
|
||||
print(f"清理后文本: '{text}'")
|
||||
if text:
|
||||
print(f"返回文本: '{text}'")
|
||||
return text
|
||||
else:
|
||||
print("文本为空或仅包含空白字符")
|
||||
else:
|
||||
print(f"未找到元素,选择器: {selector}")
|
||||
except Exception as e:
|
||||
print(f"提取字段时出错,选择器: {selector}, 错误: {e}")
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"提取字段 {field_text} 时出错: {e}")
|
||||
# continue
|
||||
|
||||
print("所有选择器都未找到有效元素,返回默认值")
|
||||
return "未知"
|
||||
|
||||
def _clean_text(self, text):
|
||||
@@ -131,7 +168,7 @@ class AiqichaDetailParser:
|
||||
"""
|
||||
try:
|
||||
# 查找电话信息容器
|
||||
phone_container = self.page.query_selector("div.business-info div.telphone-lists-wrap")
|
||||
phone_container = self.browser.page.query_selector("div.business-info div.telphone-lists-wrap")
|
||||
if phone_container:
|
||||
# 查找包含电话号码的元素
|
||||
phone_element = phone_container.query_selector("span.copy-box span")
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user