cookies
This commit is contained in:
3
.idea/.gitignore
generated
vendored
Normal file
3
.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
7
.idea/MarsCodeWorkspaceAppSettings.xml
generated
Normal file
7
.idea/MarsCodeWorkspaceAppSettings.xml
generated
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="com.codeverse.userSettings.MarscodeWorkspaceAppSettingsState">
|
||||||
|
<option name="ckgOperationStatus" value="SUCCESS" />
|
||||||
|
<option name="progress" value="1.0" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
10
.idea/SearchCompany.iml
generated
Normal file
10
.idea/SearchCompany.iml
generated
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="jdk" jdkName="Python 3.13 (SearchCompany)" jdkType="Python SDK" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
||||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
||||||
6
.idea/misc.xml
generated
Normal file
6
.idea/misc.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="Black">
|
||||||
|
<option name="sdkName" value="Python 3.13 (SearchCompany)" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/SearchCompany.iml" filepath="$PROJECT_DIR$/.idea/SearchCompany.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
BIN
OPPO广东移动通信有限公司-OPPO广东移动通信有限公司.xlsx
Normal file
BIN
OPPO广东移动通信有限公司-OPPO广东移动通信有限公司.xlsx
Normal file
Binary file not shown.
BIN
__pycache__/config.cpython-313.pyc
Normal file
BIN
__pycache__/config.cpython-313.pyc
Normal file
Binary file not shown.
0
company/aiqicha.py
Normal file
0
company/aiqicha.py
Normal file
344
company/qcc.py
Normal file
344
company/qcc.py
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
# qcc.py
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text):
    """
    Normalize the whitespace of a text fragment.

    Carriage returns, newlines and tabs are deleted outright (they are NOT
    turned into spaces, so "foo\\nbar" becomes "foobar"), every remaining run
    of whitespace is collapsed to a single space, and leading/trailing
    whitespace is trimmed. Falsy input (None or "") yields "".
    """
    if not text:
        return ""

    # Drop hard line breaks and tabs first, then squeeze what is left.
    without_breaks = text.translate({ord(ch): None for ch in '\r\n\t'})
    collapsed = re.sub(r'\s+', ' ', without_breaks)
    return collapsed.strip()
|
||||||
|
|
||||||
|
|
||||||
|
class QCCParser:
    """
    Read the basic registration table of a QCC company page.

    The parser works on a Playwright page object (sync API). Call
    parse_company_info() to locate the table and collect every supported
    field into a dict keyed by the Chinese field names.
    """

    # Trailing "关联企业 N" text that may follow the representative's name.
    _RELATED_SUFFIX_RE = re.compile(r'\s*关联企业\s*\d+$')

    def __init__(self, page):
        """Remember the page to read from; no page access happens here."""
        self.page = page
        self.company_data = {}
        # Filled in by init_table(). Kept as None until then so the getters
        # return None instead of raising AttributeError when called early.
        self.table = None

    def init_table(self):
        """
        Locate the company information table on the page.

        Waits for the page to reach the 'networkidle' load state, then looks
        for "table.ntable" inside "div.cominfo-normal".

        Returns:
            True when the table was found and stored in self.table, False
            (after printing a message) when the container or table is missing.
        """
        self.page.wait_for_load_state('networkidle')

        cominfo_normal = self.page.query_selector("div.cominfo-normal")
        if not cominfo_normal:
            print("未找到企业信息容器")
            return False

        self.table = cominfo_normal.query_selector("table.ntable")
        if not self.table:
            print("未找到企业信息表格")
            return False

        return True

    def _matching_headers(self, title):
        """Yield the header cells (td.tb) whose cleaned text contains title."""
        if self.table is None:
            return
        for header in self.table.query_selector_all("td.tb"):
            if title in clean_text(header.text_content()):
                yield header

    def _value_cell_for(self, title):
        """Return the td right after the first header matching title, or None."""
        header = next(self._matching_headers(title), None)
        if header is None:
            return None
        return header.query_selector("+ td") or None

    def _strip_related_suffix(self, text):
        """Remove a trailing "关联企业 N" marker and surrounding whitespace."""
        return self._RELATED_SUFFIX_RE.sub('', text).strip()

    def get_optimized_value(self, title):
        """
        Return the value shown next to the header containing title.

        The first matching header that has an adjacent value cell wins. The
        text of a ".copy-value" element inside that cell is preferred; the
        whole cell text is used otherwise.

        Returns:
            The cleaned text (possibly ""), or None when nothing matches.
        """
        for header in self._matching_headers(title):
            value_cell = header.query_selector("+ td")
            if not value_cell:
                continue
            copy_value = value_cell.query_selector(".copy-value")
            return clean_text((copy_value or value_cell).text_content())
        return None

    def get_legal_representative(self):
        """
        Return the legal representative's name, or None when not found.

        Tries the generic lookup first ("法定代表人", then "法人"). If that
        yields nothing usable, inspects the "法定代表人" value cell directly:
        named links, then the first link, then ".copy-value", then the raw
        cell text. A trailing "关联企业 N" marker is stripped.
        """
        basic_value = (self.get_optimized_value("法定代表人") or
                       self.get_optimized_value("法人"))
        if basic_value and basic_value.strip():
            return self._strip_related_suffix(basic_value)

        value_cell = self._value_cell_for("法定代表人")
        if not value_cell:
            return None

        # Prefer a link whose text looks like a name rather than a badge
        # ("关联企业") or a copy button ("复制").
        for link in value_cell.query_selector_all('a[target="_blank"]'):
            name = clean_text(link.text_content())
            if name and "关联企业" not in name and "复制" not in name:
                return name

        first_link = value_cell.query_selector("a")
        if first_link:
            return self._strip_related_suffix(clean_text(first_link.text_content()))

        copy_value = value_cell.query_selector(".copy-value")
        if copy_value:
            return self._strip_related_suffix(clean_text(copy_value.text_content()))

        return self._strip_related_suffix(clean_text(value_cell.text_content()))

    def get_unified_social_credit_code(self):
        """Return the unified social credit code ("统一社会信用代码" or "信用代码")."""
        return (self.get_optimized_value("统一社会信用代码") or
                self.get_optimized_value("信用代码"))

    def get_business_registration_no(self):
        """Return the business registration number ("工商注册号" or "注册号")."""
        return (self.get_optimized_value("工商注册号") or
                self.get_optimized_value("注册号"))

    def get_organization_code(self):
        """Return the organization code ("组织机构代码")."""
        return self.get_optimized_value("组织机构代码")

    def get_taxpayer_id(self):
        """Return the taxpayer ID, falling back to the unified social credit code."""
        return (self.get_optimized_value("纳税人识别号") or
                self.get_unified_social_credit_code())

    def get_insurance_number(self):
        """
        Return the insured headcount as "<number>人 <report label>".

        The number comes from the first span in the "参保人数" value cell and
        the report label from the "a.m-l-r-10" link, when present. Without a
        label the result is just "<number>人" (no trailing space).

        Returns:
            The formatted string, or None when the cell or number is missing.
        """
        value_cell = self._value_cell_for("参保人数")
        if not value_cell:
            return None

        number_span = value_cell.query_selector("span")
        number = clean_text(number_span.text_content()) if number_span else None
        if not number:
            return None

        report_link = value_cell.query_selector("a.m-l-r-10")
        report_year = clean_text(report_link.text_content()) if report_link else ""

        # strip() drops the separator space when there is no report label.
        return f"{number}人 {report_year}".strip()

    def get_phone_number(self):
        """
        Return the contact phone number from the page header, or None.

        Looks in "div.contact-info" > "div.main-part-item.right" for the first
        "div.rline" row containing "电话:", then returns the text of the first
        "span.need-copy-field" in that row that is not the label itself.
        """
        contact_info = self.page.query_selector("div.contact-info")
        if not contact_info:
            return None

        right_part = contact_info.query_selector("div.main-part-item.right")
        if not right_part:
            return None

        phone_row = next(
            (row for row in right_part.query_selector_all("div.rline")
             if "电话:" in clean_text(row.text_content())),
            None,
        )
        if not phone_row:
            return None

        for span in phone_row.query_selector_all("span.need-copy-field"):
            span_text = clean_text(span.text_content())
            if "电话:" not in span_text:
                return span_text
        return None

    def get_approval_date(self):
        """Return the approval date ("核准日期"), falling back to "成立日期"."""
        return (self.get_optimized_value("核准日期") or
                self.get_optimized_value("成立日期"))

    def parse_company_info(self):
        """
        Collect every supported field of the company page.

        Returns:
            A dict keyed by the Chinese field names (also stored in
            self.company_data), or None when the table cannot be located.
        """
        if not self.init_table():
            return None

        self.company_data = {
            "企业名称": (self.get_optimized_value("企业名称") or
                     self.get_optimized_value("公司名称")),
            "统一社会信用代码": self.get_unified_social_credit_code(),
            "法定代表人": self.get_legal_representative(),
            "经营状态": self.get_optimized_value("登记状态"),
            "成立日期": self.get_optimized_value("成立日期"),
            "行政区划": self.get_optimized_value("行政区划"),
            "注册资本": self.get_optimized_value("注册资本"),
            "实缴资本": self.get_optimized_value("实缴资本"),
            "企业类型": self.get_optimized_value("企业类型"),
            "所属行业": self.get_optimized_value("国标行业"),
            "工商注册号": self.get_business_registration_no(),
            "组织机构代码": self.get_organization_code(),
            "纳税人识别号": self.get_taxpayer_id(),
            "纳税人资质": self.get_optimized_value("纳税人资质"),
            "营业期限": self.get_optimized_value("营业期限"),
            "核准日期": self.get_approval_date(),
            "参保人数": self.get_insurance_number(),
            "电话": self.get_phone_number(),
            "登记机关": self.get_optimized_value("登记机关"),
            "曾用名": self.get_optimized_value("曾用名"),
            "注册地址": self.get_optimized_value("注册地址"),
            "经营范围": self.get_optimized_value("经营范围"),
        }

        return self.company_data
|
||||||
|
|
||||||
|
|
||||||
|
def load_cookies(context, cookie_file):
    """
    Load previously saved cookies into a browser context.

    Args:
        context: object exposing add_cookies(list), e.g. a Playwright
            browser context.
        cookie_file: path of the JSON file written by save_cookies().

    Returns:
        True when cookies were read and handed to the context. False when
        the file does not exist or cannot be read/parsed; in that case a
        message is printed and the context is left untouched, so the caller
        can fall back to a fresh login instead of aborting.
    """
    if not os.path.exists(cookie_file):
        return False

    try:
        with open(cookie_file, 'r', encoding='utf-8') as f:
            cookies = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"cookies文件无法读取,已忽略: {exc}")
        return False

    context.add_cookies(cookies)
    print("已加载本地cookies")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def save_cookies(context, cookie_file):
    """
    Write the context's current cookies to cookie_file as JSON.

    The file is created or overwritten. A confirmation message is printed
    once the cookies have been written.
    """
    current_cookies = context.cookies()
    with open(cookie_file, 'w') as out:
        json.dump(current_cookies, out)
    print("已保存cookies到文件")
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_login(page, cookie_file):
    """
    Block until the user has finished the QR-code login, then save cookies.

    Args:
        page: Playwright page currently showing (or about to show) the
            login page.
        cookie_file: path the post-login cookies are written to.

    The function first confirms, within 3 seconds, that the page URL contains
    "weblogin", then waits up to 120 seconds for the URL to stop containing
    it. Either wait raises Playwright's timeout error when it expires.
    """
    print("检测到需要登录,请使用手机扫码登录...")
    print("登录成功后将自动跳转到目标页面")

    # Match on the substring rather than the glob "**/weblogin": the glob
    # only matches URLs that END in "/weblogin", so a login URL carrying a
    # query string would time out here even though the caller already
    # detected "weblogin" in page.url.
    page.wait_for_url(lambda url: "weblogin" in url, timeout=3000)
    page.wait_for_url(lambda url: "weblogin" not in url, timeout=120000)

    # Persist the authenticated session for later runs.
    save_cookies(page.context, cookie_file)
    print("登录成功,已保存cookies")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Command-line entry point: open a QCC company page and print its details.

    Parses the command line (page URL, --headless, --cookie-file), starts
    Chromium through Playwright, restores saved cookies when available, runs
    the interactive login when the site redirects to it, and prints the
    parsed company data as indented JSON. Any exception is reported on
    stdout and the browser is always closed.
    """
    arg_parser = argparse.ArgumentParser(description='解析企查查公司信息')
    arg_parser.add_argument('url', help='企查查公司页面URL')
    arg_parser.add_argument('--headless', action='store_true', help='无头模式运行')
    arg_parser.add_argument('--cookie-file', default='qcc_cookies.txt', help='cookies文件路径')
    options = arg_parser.parse_args()

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=options.headless)
        context = browser.new_context()
        page = context.new_page()

        try:
            # Reuse a saved session when one exists.
            cookies_loaded = load_cookies(context, options.cookie_file)
            if cookies_loaded:
                print("使用已保存的登录信息")

            page.goto(options.url)

            # The site redirects to its login page when the session is missing.
            needs_login = "weblogin" in page.url
            if not needs_login:
                print("已登录或无需登录")
            else:
                wait_for_login(page, options.cookie_file)

            # Load the target page again so parsing starts from a fresh render.
            page.goto(options.url)

            company_info = QCCParser(page).parse_company_info()
            if not company_info:
                print("未能获取公司信息")
            else:
                print(json.dumps(company_info, ensure_ascii=False, indent=2))
        except Exception as exc:
            print(f"发生错误: {exc}")
        finally:
            browser.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    main()

# Usage examples:
# python qcc.py "https://www.qcc.com/firm/50b0e3189f2eb2b20304b255669ce1a1.html"
#
# The first run requires scanning the QR code to log in:
# python qcc.py "https://www.qcc.com/firm/<company-page>"
#
# Later runs automatically reuse the saved login information:
# python qcc.py "https://www.qcc.com/firm/<company-page>"
#
# Use a custom cookies file:
# python qcc.py --cookie-file my_cookies.txt "https://www.qcc.com/firm/<company-page>"
|
||||||
831
company/youhou.js
Normal file
831
company/youhou.js
Normal file
@@ -0,0 +1,831 @@
|
|||||||
|
// ==UserScript==
|
||||||
|
// @name 爱企查&企查查等
|
||||||
|
// @namespace http://tampermonkey.net/
|
||||||
|
// @version 0.2
|
||||||
|
// @description 在页面右下角添加工具按钮,支持复制源码和解析公司信息
|
||||||
|
// @author You
|
||||||
|
// @match https://www.qcc.com/firm/*
|
||||||
|
// @match https://aiqicha.baidu.com/company_detail_*
|
||||||
|
// @grant none
|
||||||
|
// ==/UserScript==
|
||||||
|
|
||||||
|
(function () {
|
||||||
|
"use strict";
|
||||||
|
|
||||||
|
// 工具类 - 存放通用函数
|
||||||
|
/**
 * Shared helpers for the page parsers: text cleanup, clipboard access and
 * small on-page UI widgets (toast message, result dialog).
 */
class ToolUtils {
  /**
   * Collapse every run of whitespace (including line breaks and tabs) into a
   * single space and trim both ends.
   *
   * @param {?string} text Raw text; null/undefined is treated as empty.
   * @returns {string} The cleaned text, "" for null/undefined input.
   */
  static cleanText(text) {
    if (text == null) return "";
    // After this replace no \r, \n or \t can remain, so one pass is enough.
    return String(text).replace(/\s+/g, " ").trim();
  }

  /**
   * Return the trimmed text of the first selector that matches an element
   * with non-blank text.
   *
   * @param {{querySelector: function(string): ?Element}} doc Root to search.
   * @param {string[]} selectors CSS selectors, tried in order.
   * @returns {string} The trimmed text, or "" when nothing matches.
   */
  static extractText(doc, selectors) {
    for (const selector of selectors) {
      const text = doc.querySelector(selector)?.textContent.trim();
      if (text) return text;
    }
    return "";
  }

  /**
   * Copy text to the clipboard through a temporary off-screen textarea.
   *
   * @param {string} content Text to copy.
   * @param {string} [successMessage] Toast shown when the copy succeeds.
   * @returns {boolean} true when the browser reported a successful copy.
   */
  static copyToClipboard(content, successMessage) {
    const textarea = document.createElement("textarea");
    textarea.value = content;
    Object.assign(textarea.style, {
      position: "fixed",
      top: "0",
      left: "0",
      width: "1px",
      height: "1px",
      opacity: "0",
    });
    document.body.appendChild(textarea);

    let copied = false;
    try {
      textarea.select();
      // execCommand returns false when the copy was refused.
      copied = document.execCommand("copy");
    } catch (err) {
      copied = false;
    } finally {
      // Always remove the helper element, even if the copy threw.
      document.body.removeChild(textarea);
    }

    if (!copied) {
      ToolUtils.showAutoCloseMessage("复制失败", "error");
      return false;
    }
    if (successMessage) {
      ToolUtils.showAutoCloseMessage(successMessage, "success");
    }
    return true;
  }

  /**
   * Show a centered toast that fades out and removes itself after 2 seconds.
   *
   * @param {string} message Text to display.
   * @param {string} [type="info"] "success", "error", or anything else for
   *   the default (info) color.
   */
  static showAutoCloseMessage(message, type = "info") {
    const backgroundByType = { success: "#52c41a", error: "#f5222d" };

    const alertBox = document.createElement("div");
    alertBox.textContent = message;
    Object.assign(alertBox.style, {
      position: "fixed",
      top: "50%",
      left: "50%",
      transform: "translate(-50%, -50%)",
      backgroundColor: backgroundByType[type] || "#1890ff",
      color: "white",
      padding: "10px 20px",
      borderRadius: "4px",
      zIndex: "10001",
      boxShadow: "0 2px 8px rgba(0,0,0,0.15)",
      transition: "opacity 0.3s",
    });
    document.body.appendChild(alertBox);

    setTimeout(() => {
      if (!document.body.contains(alertBox)) return;
      // Fade out first, then drop the element once the transition is over.
      alertBox.style.opacity = "0";
      setTimeout(() => {
        if (document.body.contains(alertBox)) {
          document.body.removeChild(alertBox);
        }
      }, 300);
    }, 2000);
  }

  /**
   * Show the parsed data as pretty-printed JSON in a centered dialog with
   * "copy" and "close" buttons.
   *
   * @param {*} data Any JSON-serializable value.
   */
  static showResult(data) {
    const json = JSON.stringify(data, null, 2);

    const modal = document.createElement("div");
    Object.assign(modal.style, {
      position: "fixed",
      top: "50%",
      left: "50%",
      transform: "translate(-50%, -50%)",
      width: "600px",
      maxHeight: "80vh",
      overflowY: "auto",
      backgroundColor: "white",
      padding: "20px",
      boxShadow: "0 0 10px rgba(0,0,0,0.3)",
      zIndex: "10000",
    });

    const pre = document.createElement("pre");
    pre.textContent = json;
    pre.style.whiteSpace = "pre-wrap";
    pre.style.wordWrap = "break-word";

    const buttonStyle = {
      marginTop: "10px",
      padding: "8px 16px",
      color: "white",
      border: "none",
      borderRadius: "4px",
      cursor: "pointer",
    };

    const copyBtn = document.createElement("button");
    copyBtn.textContent = "复制JSON";
    Object.assign(copyBtn.style, buttonStyle, { backgroundColor: "#52c41a" });
    copyBtn.addEventListener("click", () => {
      // The async clipboard API is missing in some contexts; fall back to
      // the textarea-based copy instead of throwing inside the handler.
      if (!navigator.clipboard || !navigator.clipboard.writeText) {
        ToolUtils.copyToClipboard(json, "已复制到剪贴板");
        return;
      }
      navigator.clipboard
        .writeText(json)
        .then(() => ToolUtils.showAutoCloseMessage("已复制到剪贴板", "success"))
        .catch((err) =>
          ToolUtils.showAutoCloseMessage("复制失败: " + err, "error")
        );
    });

    const closeBtn = document.createElement("button");
    closeBtn.textContent = "关闭";
    Object.assign(closeBtn.style, buttonStyle, {
      marginLeft: "10px",
      backgroundColor: "#f5222d",
    });
    closeBtn.addEventListener("click", () => {
      if (document.body.contains(modal)) {
        document.body.removeChild(modal);
      }
    });

    modal.innerHTML = '<h2 style="margin-top: 0;">企业信息解析结果</h2>';
    modal.appendChild(pre);
    modal.appendChild(document.createElement("br"));
    modal.appendChild(copyBtn);
    modal.appendChild(closeBtn);

    document.body.appendChild(modal);
  }
}
|
||||||
|
|
||||||
|
// 爱企查解析类
|
||||||
|
/**
 * Parser for the basic-information table of an Aiqicha company detail page.
 * Call parseCompanyInfo() to collect all fields and show them in a dialog.
 */
class AiQiChaParser {
  constructor() {
    // Set by initTable(); the getters below expect it to be non-null.
    this.table = null;
  }

  /**
   * Locate the basic-information table.
   * @returns {boolean} false (after an alert) when the table is missing.
   */
  initTable() {
    this.table = document.querySelector("table.zx-detail-basic-table");
    if (!this.table) {
      alert("未找到企业信息表格");
      return false;
    }
    return true;
  }

  /** All td cells of the table, as an array. */
  _cells() {
    return Array.from(this.table.querySelectorAll("td"));
  }

  /**
   * Cleaned value of a cell: the non-empty text of its ".enter-bg-ele"
   * child when there is one, otherwise the text of the whole cell.
   */
  _readValue(cell) {
    const raw =
      cell.querySelector(".enter-bg-ele")?.textContent || cell.textContent;
    return ToolUtils.cleanText(raw);
  }

  /**
   * Value of the cell following the FIRST title cell accepted by predicate.
   * @returns {?string} null when no title cell matches or it has no sibling.
   */
  _valueAfterFirst(predicate) {
    const valueCell = this._cells().find(predicate)?.nextElementSibling;
    return valueCell ? this._readValue(valueCell) : null;
  }

  /** First "<digits>人" occurrence in the cell's text, or null. */
  _extractHeadcount(cell) {
    const rawText = cell.textContent.replace(/[\r\n\t]/g, "").trim();
    const match = rawText.match(/(\d+人)/);
    return match ? match[0] : null;
  }

  /**
   * Phone number from the business-info header.
   * @returns {string} The number, or a "not found" message string when the
   *   container or the number element is missing.
   */
  getPhoneNumber() {
    const phoneContainer = document.querySelector(
      "div.business-info div.telphone-lists-wrap"
    );
    if (!phoneContainer) return "未找到电话信息";

    const phoneElement = phoneContainer.querySelector("span.copy-box span");
    if (!phoneElement) return "未找到电话号码";

    return ToolUtils.cleanText(phoneElement.textContent);
  }

  /**
   * Value next to the cell whose cleaned text equals title exactly.
   * Prefers ".enter-bg-ele", then ".addr-enter-bg-ele", then the cell itself.
   * @param {string} title Exact title text.
   * @returns {?string} Cleaned value, or null when the title is not found.
   */
  getOptimizedValue(title) {
    const titleCell = this._cells().find(
      (cell) => ToolUtils.cleanText(cell.textContent) === title
    );
    if (!titleCell) return null;

    const valueCell = titleCell.nextElementSibling;
    if (!valueCell) return null;

    const valueElement =
      valueCell.querySelector(".enter-bg-ele") ||
      valueCell.querySelector(".addr-enter-bg-ele") ||
      valueCell;

    return ToolUtils.cleanText(valueElement.textContent);
  }

  /** Legal representative's name, or null when not found. */
  getLegalRepresentative() {
    const cells = this._cells();

    // Exact title match with the image+text value layout: prefer the link.
    const exactTitle = cells.find(
      (td) => ToolUtils.cleanText(td.textContent) === "法定代表人"
    );
    const richCell = exactTitle?.nextElementSibling;
    if (richCell && richCell.classList.contains("image-text-content")) {
      const nameElement = richCell.querySelector(".person-name-warp a");
      return ToolUtils.cleanText((nameElement || richCell).textContent);
    }

    // Fallback: any cell mentioning the title, plain text of its sibling.
    const looseTitle = cells.find((td) =>
      td.textContent.includes("法定代表人")
    );
    const plainCell = looseTitle?.nextElementSibling;
    return plainCell ? ToolUtils.cleanText(plainCell.textContent) : null;
  }

  /** Unified social credit code, falling back to the taxpayer ID cell. */
  getUnifiedSocialCreditCode() {
    const code = this._valueAfterFirst(
      (td) =>
        td.textContent.includes("统一社会信用代码") &&
        td.nextElementSibling &&
        td.nextElementSibling.classList.contains("table-regCapital-lable")
    );
    if (code !== null) return code;

    return this._valueAfterFirst((td) =>
      td.textContent.includes("纳税人识别号")
    );
  }

  /** Business registration number, or null when not found. */
  getBusinessRegistrationNo() {
    return this._valueAfterFirst((td) =>
      ToolUtils.cleanText(td.textContent).includes("工商注册号")
    );
  }

  /** Organization code, or null when not found. */
  getOrganizationCode() {
    const marker = Array.from(
      this.table.querySelectorAll(".poptip-wrap-org-no")
    ).find((el) => el.textContent.includes("组织机构代码"));

    // closest() may return null; optional chaining avoids a TypeError.
    const markedCell = marker?.closest("td")?.nextElementSibling;
    if (markedCell && markedCell.classList.contains("enter-bg")) {
      return this._readValue(markedCell);
    }

    return this._valueAfterFirst(
      (td) => ToolUtils.cleanText(td.textContent) === "组织机构代码"
    );
  }

  /** Taxpayer ID, falling back to the unified social credit code cell. */
  getTaxpayerId() {
    const taxId = this._valueAfterFirst((td) =>
      ToolUtils.cleanText(td.textContent).includes("纳税人识别号")
    );
    if (taxId !== null) return taxId;

    return this._valueAfterFirst((td) =>
      ToolUtils.cleanText(td.textContent).includes("统一社会信用代码")
    );
  }

  /** Insured headcount as "<digits>人", or null when not found. */
  getInsuranceNumber() {
    const cells = this._cells();

    const insuranceTitle = cells.find(
      (td) =>
        td.textContent.includes("参保人数") &&
        td.querySelector(".insurance-info")
    );
    if (insuranceTitle) {
      const valueCell = insuranceTitle.nextElementSibling;
      return valueCell ? this._extractHeadcount(valueCell) : null;
    }

    // Fallback: read the cell that sits right before the "登记机关" title.
    const registrationTitle = cells.find((td) =>
      td.textContent.includes("登记机关")
    );
    const previousCell = registrationTitle?.previousElementSibling;
    return previousCell ? this._extractHeadcount(previousCell) : null;
  }

  /** Approval date as YYYY-MM-DD, or null when missing or malformed. */
  getApprovalDate() {
    const isIsoDate = (value) => /^\d{4}-\d{2}-\d{2}$/.test(value);

    const marker = Array.from(
      this.table.querySelectorAll(".poptip-wrap-annual-date")
    ).find((el) => el.textContent.includes("核准日期"));

    // closest() may return null; optional chaining avoids a TypeError.
    const markedCell = marker?.closest("td")?.nextElementSibling;
    if (markedCell) {
      const rawValue = markedCell.textContent.replace(/[\r\n\t]/g, "").trim();
      if (isIsoDate(rawValue)) return rawValue;
    }

    const titleCell = this._cells().find(
      (td) => ToolUtils.cleanText(td.textContent) === "核准日期"
    );
    const valueCell = titleCell?.nextElementSibling;
    if (valueCell) {
      const rawValue = ToolUtils.cleanText(valueCell.textContent);
      if (isIsoDate(rawValue)) return rawValue;
    }

    return null;
  }

  /**
   * Collect every supported field and show the result dialog.
   * Does nothing (beyond the alert from initTable) when the table is missing.
   */
  parseCompanyInfo() {
    if (!this.initTable()) return;

    const companyData = {
      企业名称: this.getOptimizedValue("企业名称"),
      统一社会信用代码: this.getUnifiedSocialCreditCode(),
      法定代表人: this.getLegalRepresentative(),
      电话: this.getPhoneNumber(),
      经营状态: this.getOptimizedValue("经营状态"),
      成立日期: this.getOptimizedValue("成立日期"),
      行政区划: this.getOptimizedValue("行政区划"),
      注册资本: this.getOptimizedValue("注册资本"),
      实缴资本: this.getOptimizedValue("实缴资本"),
      企业类型: this.getOptimizedValue("企业类型"),
      所属行业: this.getOptimizedValue("所属行业"),
      工商注册号: this.getBusinessRegistrationNo(),
      组织机构代码: this.getOrganizationCode(),
      纳税人识别号: this.getTaxpayerId(),
      纳税人资质: this.getOptimizedValue("纳税人资质"),
      营业期限: this.getOptimizedValue("营业期限"),
      核准日期: this.getApprovalDate(),
      参保人数: this.getInsuranceNumber(),
      登记机关: this.getOptimizedValue("登记机关"),
      曾用名: this.getOptimizedValue("曾用名"),
      注册地址: this.getOptimizedValue("注册地址"),
      经营范围: this.getOptimizedValue("经营范围"),
    };

    ToolUtils.showResult(companyData);
  }
}
|
||||||
|
|
||||||
|
// QCC解析类 企查查
|
||||||
|
class QCCParser {
  constructor() {
    // Table element holding the company's registration details; set by initTable().
    this.table = null;
  }

  /**
   * Locate the company-information table on the current page.
   * @returns {boolean} true when the table was found and stored in this.table;
   *   false (after alerting the user) when the container or table is missing.
   */
  initTable() {
    const cominfoNormal = document.querySelector("div.cominfo-normal");
    if (!cominfoNormal) {
      alert("未找到企业信息容器");
      return false;
    }

    this.table = cominfoNormal.querySelector("table.ntable");
    if (!this.table) {
      alert("未找到企业信息表格");
      return false;
    }
    return true;
  }

  /**
   * Remove a trailing "关联企业 N" (affiliated companies counter) from a name.
   * @param {string} text
   * @returns {string} the trimmed text without the counter suffix.
   */
  _stripAffiliateSuffix(text) {
    return text.replace(/\s*关联企业\s*\d+$/, "").trim();
  }

  /**
   * Read the value cell that follows the header cell labelled `title`.
   *
   * Header cells whose cleaned text equals `title` are preferred, so that a
   * short title such as "注册号" is not captured by a longer label that merely
   * contains it. When no header matches exactly, headers that contain `title`
   * are used (the previous behaviour). Within the chosen group the last
   * header that has a value cell wins, as before.
   *
   * @param {string} title header label to look for
   * @returns {string|null} cleaned text of the ".copy-value" element inside
   *   the value cell when present, otherwise of the whole value cell; null
   *   when no matching header with a value cell exists.
   */
  getOptimizedValue(title) {
    const headers = Array.from(this.table.querySelectorAll("td.tb")).map(
      (cell) => ({ cell, label: ToolUtils.cleanText(cell.textContent) })
    );

    const exactMatches = headers.filter((h) => h.label === title);
    const candidates =
      exactMatches.length > 0
        ? exactMatches
        : headers.filter((h) => h.label.includes(title));

    let value = null;
    for (const { cell } of candidates) {
      const valueCell = cell.nextElementSibling;
      if (!valueCell) continue;
      // Prefer the dedicated copy-value element over the whole cell text
      const copyValue = valueCell.querySelector(".copy-value");
      value = ToolUtils.cleanText((copyValue || valueCell).textContent);
    }
    return value;
  }

  /**
   * Get the legal representative's name.
   *
   * Tries the generic lookup first ("法定代表人", then "法人"); when that
   * yields nothing usable, inspects the value cell's links, then its
   * ".copy-value" element, then its raw text.
   *
   * @returns {string|null} the name without any "关联企业 N" suffix, or null
   *   when the header or its value cell cannot be found.
   */
  getLegalRepresentative() {
    const basicValue =
      this.getOptimizedValue("法定代表人") || this.getOptimizedValue("法人");
    if (basicValue && basicValue.trim()) {
      return this._stripAffiliateSuffix(basicValue);
    }

    // Generic lookup failed: locate the cell manually
    const headerCell = Array.from(this.table.querySelectorAll("td.tb")).find(
      (cell) => ToolUtils.cleanText(cell.textContent).includes("法定代表人")
    );
    if (!headerCell) return null;

    const valueCell = headerCell.nextElementSibling;
    if (!valueCell) return null;

    // Links opened in a new tab are the first place the name is looked for
    const nameLinks = valueCell.querySelectorAll('a[target="_blank"]');
    for (const link of nameLinks) {
      const name = ToolUtils.cleanText(link.textContent);
      // Skip empty links and obvious non-name texts
      if (name && !name.includes("关联企业") && !name.includes("复制")) {
        return name;
      }
    }

    // Otherwise take the first link of any kind
    const firstLink = valueCell.querySelector("a");
    if (firstLink) {
      return this._stripAffiliateSuffix(
        ToolUtils.cleanText(firstLink.textContent)
      );
    }

    // Otherwise the copy-value element, and finally the raw cell text
    const copyValue = valueCell.querySelector(".copy-value");
    if (copyValue) {
      return this._stripAffiliateSuffix(
        ToolUtils.cleanText(copyValue.textContent)
      );
    }

    return this._stripAffiliateSuffix(
      ToolUtils.cleanText(valueCell.textContent)
    );
  }

  /** Get the unified social credit code ("统一社会信用代码", else "信用代码"). */
  getUnifiedSocialCreditCode() {
    return (
      this.getOptimizedValue("统一社会信用代码") ||
      this.getOptimizedValue("信用代码")
    );
  }

  /** Get the business registration number ("工商注册号", else "注册号"). */
  getBusinessRegistrationNo() {
    return (
      this.getOptimizedValue("工商注册号") || this.getOptimizedValue("注册号")
    );
  }

  /** Get the organization code ("组织机构代码"). */
  getOrganizationCode() {
    return this.getOptimizedValue("组织机构代码");
  }

  /**
   * Get the taxpayer identification number; falls back to the unified
   * social credit code when the table has no "纳税人识别号" value.
   */
  getTaxpayerId() {
    return (
      this.getOptimizedValue("纳税人识别号") ||
      this.getUnifiedSocialCreditCode()
    );
  }

  /**
   * Get the number of insured employees.
   * @returns {string|null} "<number>人", followed by a space and the text of
   *   the "a.m-l-r-10" link when that link exists; null when the header, its
   *   value cell or the number is missing.
   */
  getInsuranceNumber() {
    const headerCell = Array.from(this.table.querySelectorAll("td.tb")).find(
      (cell) => ToolUtils.cleanText(cell.textContent).includes("参保人数")
    );
    if (!headerCell) return null;

    const valueCell = headerCell.nextElementSibling;
    if (!valueCell) return null;

    // The head-count is read from the first <span> of the value cell
    const numberSpan = valueCell.querySelector("span");
    const number = numberSpan
      ? ToolUtils.cleanText(numberSpan.textContent)
      : null;

    // Text of the annual-report link, when present
    const reportLink = valueCell.querySelector("a.m-l-r-10");
    const reportYear = reportLink
      ? ToolUtils.cleanText(reportLink.textContent)
      : "";

    // trim() avoids a dangling space when there is no report link
    return number ? `${number}人 ${reportYear}`.trim() : null;
  }

  /**
   * Get the contact phone number from the page's contact-info section.
   * @returns {string|null} the cleaned phone text, or null when any of the
   *   expected elements is missing.
   */
  getPhoneNumber() {
    const contactInfo = document.querySelector("div.contact-info");
    if (!contactInfo) return null;

    const rightPart = contactInfo.querySelector("div.main-part-item.right");
    if (!rightPart) return null;

    // Find the row whose text contains the "电话:" label
    const rows = Array.from(rightPart.querySelectorAll("div.rline"));
    const phoneRow = rows.find((row) =>
      ToolUtils.cleanText(row.textContent).includes("电话:")
    );
    if (!phoneRow) return null;

    // The number is the first copyable span that is not the label itself
    const spans = Array.from(
      phoneRow.querySelectorAll("span.need-copy-field")
    );
    const phoneSpan = spans.find(
      (span) => !ToolUtils.cleanText(span.textContent).includes("电话:")
    );

    return phoneSpan ? ToolUtils.cleanText(phoneSpan.textContent) : null;
  }

  /**
   * Get the approval date ("核准日期").
   * NOTE(review): falls back to the establishment date ("成立日期") when no
   * approval date is found — confirm this substitution is intended.
   */
  getApprovalDate() {
    return (
      this.getOptimizedValue("核准日期") || this.getOptimizedValue("成立日期")
    );
  }

  /**
   * Main entry point: collect every supported field and display the result.
   * Returns early (with no value) when initTable() reports failure.
   */
  parseCompanyInfo() {
    if (!this.initTable()) return;

    // Keys are the user-facing column names; some read a differently
    // labelled row on this site (e.g. "登记状态", "国标行业").
    const companyData = {
      企业名称:
        this.getOptimizedValue("企业名称") ||
        this.getOptimizedValue("公司名称"),
      统一社会信用代码: this.getUnifiedSocialCreditCode(),
      法定代表人: this.getLegalRepresentative(),
      经营状态: this.getOptimizedValue("登记状态"),
      成立日期: this.getOptimizedValue("成立日期"),
      行政区划: this.getOptimizedValue("行政区划"),
      注册资本: this.getOptimizedValue("注册资本"),
      实缴资本: this.getOptimizedValue("实缴资本"),
      企业类型: this.getOptimizedValue("企业类型"),
      所属行业: this.getOptimizedValue("国标行业"),
      工商注册号: this.getBusinessRegistrationNo(),
      组织机构代码: this.getOrganizationCode(),
      纳税人识别号: this.getTaxpayerId(),
      纳税人资质: this.getOptimizedValue("纳税人资质"),
      营业期限: this.getOptimizedValue("营业期限"),
      核准日期: this.getApprovalDate(),
      参保人数: this.getInsuranceNumber(),
      电话: this.getPhoneNumber(),
      登记机关: this.getOptimizedValue("登记机关"),
      曾用名: this.getOptimizedValue("曾用名"),
      注册地址: this.getOptimizedValue("注册地址"),
      经营范围: this.getOptimizedValue("经营范围"),
    };

    ToolUtils.showResult(companyData);
  }
}
|
||||||
|
|
||||||
|
// 创建按钮容器
|
||||||
|
/**
 * Build the floating tool widget and append it to document.body.
 *
 * The widget is a small green circle fixed at the bottom-right corner that
 * expands on hover to reveal two buttons ("复制源码" and "解析公司信息") and
 * can be dragged by its "+" sign or its own background.
 */
function createButtonContainer() {
  const container = document.createElement("div");
  container.id = "tool-container";
  Object.assign(container.style, {
    position: "fixed",
    right: "20px",
    bottom: "20px",
    zIndex: "9999",
    display: "flex",
    flexDirection: "column",
    gap: "10px",
    width: "40px",
    height: "40px",
    backgroundColor: "#4CAF50",
    borderRadius: "50%",
    transition: "all 0.3s ease",
    overflow: "hidden",
    cursor: "move",
  });

  // "+" indicator shown while the widget is collapsed
  const plusSign = document.createElement("div");
  plusSign.textContent = "+";
  Object.assign(plusSign.style, {
    color: "white",
    fontSize: "24px",
    textAlign: "center",
    lineHeight: "40px",
    width: "100%",
  });
  container.appendChild(plusSign);

  // Expand on hover, collapse back to the circle when the pointer leaves
  container.addEventListener("mouseenter", () => {
    container.style.width = "150px";
    container.style.height = "auto";
    container.style.borderRadius = "8px";
  });

  container.addEventListener("mouseleave", () => {
    container.style.width = "40px";
    container.style.height = "40px";
    container.style.borderRadius = "50%";
  });

  // Drag support: pointer offset inside the widget at the moment of mousedown
  let isDragging = false;
  let offsetX, offsetY;

  container.addEventListener("mousedown", (e) => {
    // Only the "+" sign or the container background starts a drag, so that
    // pressing one of the buttons does not move the widget
    if (e.target === plusSign || e.target === container) {
      isDragging = true;
      const rect = container.getBoundingClientRect();
      offsetX = e.clientX - rect.left;
      offsetY = e.clientY - rect.top;
      container.style.cursor = "grabbing";
      e.stopPropagation();
      e.preventDefault();
    }
  });

  // Follow the pointer while dragging; switch from right/bottom anchoring to
  // explicit left/top coordinates
  document.addEventListener("mousemove", (e) => {
    if (!isDragging) return;
    container.style.left = e.clientX - offsetX + "px";
    container.style.top = e.clientY - offsetY + "px";
    container.style.right = "auto";
    container.style.bottom = "auto";
  });

  document.addEventListener("mouseup", () => {
    if (isDragging) {
      isDragging = false;
      container.style.cursor = "move";
    }
  });

  /**
   * Create one white action button.
   * @param {string} text button label
   * @param {Function} onClick click handler
   * @returns {HTMLButtonElement}
   */
  function createButton(text, onClick) {
    const button = document.createElement("button");
    button.textContent = text;
    Object.assign(button.style, {
      padding: "8px 12px",
      border: "none",
      borderRadius: "4px",
      backgroundColor: "white",
      color: "#333",
      cursor: "pointer",
      width: "100%",
      // The transition shorthand takes CSS property names (kebab-case);
      // "backgroundColor" matched no property, so the hover never animated.
      transition: "background-color 0.2s",
    });
    button.addEventListener(
      "mouseenter",
      () => (button.style.backgroundColor = "#f0f0f0")
    );
    button.addEventListener(
      "mouseleave",
      () => (button.style.backgroundColor = "white")
    );
    button.addEventListener("click", onClick);
    return button;
  }

  // "Copy source" button: copies the whole document's HTML
  const copySourceButton = createButton("复制源码", () => {
    const html = document.documentElement.outerHTML;
    copyToClipboard(html, "HTML源码已复制到剪贴板");
  });

  // "Parse company info" button: picks the parser from the current host
  const parseInfoButton = createButton("解析公司信息", () => {
    let parser;
    if (window.location.host.includes("aiqicha.baidu.com")) {
      parser = new AiQiChaParser();
    } else if (window.location.host.includes("qcc.com")) {
      parser = new QCCParser();
    } else {
      alert("不支持的网站");
      return;
    }
    parser.parseCompanyInfo();
  });

  container.appendChild(copySourceButton);
  container.appendChild(parseInfoButton);

  document.body.appendChild(container);
}
|
||||||
|
|
||||||
|
// 页面加载完成后创建按钮
|
||||||
|
window.addEventListener("load", createButtonContainer);
|
||||||
|
})();
|
||||||
23
config.py
Normal file
23
config.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
|
||||||
|
|
||||||
|
import os


def _cookie_from_env(var_name):
    """Return the cookie string stored in environment variable *var_name*.

    Returns '' when the variable is unset. Carriage returns and line feeds
    are removed because they are not allowed inside an HTTP header value.
    """
    return os.environ.get(var_name, '').replace('\r', '').replace('\n', '').strip()


# Request headers for the three search engines. Logged-in cookies give the
# best results, but they are session credentials and must not be committed
# to version control: export them as BING_COOKIE / BAIDU_COOKIE /
# GOOGLE_COOKIE instead.
# NOTE(security): a logged-in Baidu cookie (including BDUSS) was previously
# hard-coded here; that session should be treated as leaked and invalidated.
bingheaders = {
    'cookie': _cookie_from_env('BING_COOKIE'),
    'referer': 'https://cn.bing.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4051.0 Safari/537.36 Edg/82.0.425.0'}

baiduheaders = {
    'Cookie': _cookie_from_env('BAIDU_COOKIE'),
    'Host': 'www.baidu.com',
    'referer': 'https://www.baidu.com/s',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0'
}
googleheaders = {
    'cookie': _cookie_from_env('GOOGLE_COOKIE'),
    'referer': 'https://www.google.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}

# Crawling Google requires a proxy
proxy='http://127.0.0.1:7897'
|
||||||
|
|
||||||
69
data.csv
Normal file
69
data.csv
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
杭州辉煌物业管理有限公司
|
||||||
|
杭州辉望科技有限公司
|
||||||
|
浙江八方电信科技集团有限公司
|
||||||
|
中国移动通信集团浙江有限公司余杭分公司西溪八方城(自营厅)
|
||||||
|
瑞凤九天(杭州)科技有限公司
|
||||||
|
金码智能科技(杭州)有限公司
|
||||||
|
OPPO广东移动通信有限公司
|
||||||
|
杭州巨量引擎网络技术有限公司
|
||||||
|
杭州绿城衡宇环境设计有限公司
|
||||||
|
新疆浙疆果业有限公司
|
||||||
|
杭州未知数品牌管理有限公司
|
||||||
|
杭州慈山科技有限公司
|
||||||
|
杭州扬拓体育科技有限公司
|
||||||
|
杭州乂游网络科技有限公司
|
||||||
|
杭州钱橙似锦科技有限公司
|
||||||
|
杭州奥得徕贸易有限公司
|
||||||
|
杭州伍壹荟旅游咨询有限公司
|
||||||
|
杭州心满意定供应链服务有限公司
|
||||||
|
杭州麒晨科技有限公司
|
||||||
|
杭州羊咩咩文化传媒有限公司
|
||||||
|
杭州禾露则正生物科技有限公司
|
||||||
|
浙江商盟支付有限公司
|
||||||
|
天禄(杭州)科技有限公司
|
||||||
|
如是启创(杭州)科技有限公司
|
||||||
|
杭州音视贝科技有限公司
|
||||||
|
杭州千骏轴承有限公司
|
||||||
|
杭州锐擎科技有限公司
|
||||||
|
浙江力一科技有限公司
|
||||||
|
煜邦电力智能装备(嘉兴)有限公司
|
||||||
|
杭州裕阳经营管理合伙企业(有限合伙)
|
||||||
|
氧气.康复中心
|
||||||
|
杭州云迹物联科技有限公司
|
||||||
|
杭州着墨文化创意有限公司
|
||||||
|
亚信科技(南京)有限公司
|
||||||
|
杭州密尔沃智能装备有限公司
|
||||||
|
杭州骏远电子商务有限公司
|
||||||
|
杭州一喂智能科技有限公司
|
||||||
|
杭州孚伦特科技有限公司
|
||||||
|
杭州人谋天成科技有限公司
|
||||||
|
杭州瑾馨贸易集团有限公司
|
||||||
|
杭州琑为缘文化艺术有限公司
|
||||||
|
浙江丝里伯睡眠科技股份有限公司
|
||||||
|
杭州倍驰科技有限公司
|
||||||
|
杭州心灵部落教育有限公司(灵动生活)
|
||||||
|
杭州云印智造科技有限公司
|
||||||
|
浙江海拓环境技术有限公司
|
||||||
|
申能环境科技有限公司
|
||||||
|
医贝云服(杭州)科技有限公司
|
||||||
|
杭州甬盛通信技术有限公司
|
||||||
|
杭州字节跳动科技有限公司
|
||||||
|
杭州邻汇网络科技有限公司
|
||||||
|
浙江建盛安全科技有限公司
|
||||||
|
幻想集团·杭州运营中心
|
||||||
|
杭州阿克莱斯设备有限公司
|
||||||
|
浙江省现代农业促进会
|
||||||
|
益思芯科技(杭州)有限公司
|
||||||
|
杭州霖思网络科技有限公司
|
||||||
|
杭州星瀚知识产权代理有限公司
|
||||||
|
风华(杭州)信息技术有限公司
|
||||||
|
杭州晓羽科技有限公司
|
||||||
|
浙江根旺律师事务所
|
||||||
|
远大住宅工业(杭州)有限公司
|
||||||
|
浙江全应科技有限公司
|
||||||
|
杭州塞牧文化传媒有限公司
|
||||||
|
浙江彩屋信息技术有限公司
|
||||||
|
杭州瑞泡特教育科技有限公司
|
||||||
|
杭州贝享健康科技有限公司
|
||||||
|
杭州摸象大数据科技有限公司
|
||||||
|
杭州颐刻生物科技有限公司
|
||||||
|
1
freeze.bat
Normal file
1
freeze.bat
Normal file
@@ -0,0 +1 @@
|
|||||||
|
pip freeze > requirements.txt
|
||||||
2
install_requirements.bat
Normal file
2
install_requirements.bat
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
pip install -r requirements.txt
|
||||||
|
python.exe -m pip install --upgrade pip
|
||||||
138
main.py
Normal file
138
main.py
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import asyncio
|
||||||
|
import random
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import re
|
||||||
|
import aiofiles
|
||||||
|
import urllib.parse
|
||||||
|
import argparse
|
||||||
|
from colorama import init, Fore
|
||||||
|
from search import Bing,Baidu
|
||||||
|
import openpyxl
|
||||||
|
import ssl
|
||||||
|
|
||||||
|
from tool.read_csv import CSVReader
|
||||||
|
|
||||||
|
start = time.time()
|
||||||
|
def printascii():
|
||||||
|
# 初始化
|
||||||
|
init()
|
||||||
|
# 设置颜色
|
||||||
|
print(Fore.GREEN + r'''
|
||||||
|
____ _
|
||||||
|
/ ___| ___ __ _ _ __ ___| |__ ___ _ __
|
||||||
|
\___ \ / _ \/ _` | '__/ __| '_ \ / _ \ '__|
|
||||||
|
___) | __/ (_| | | | (__| | | | __/ |
|
||||||
|
|____/ \___|\__,_|_| \___|_| |_|\___|_|
|
||||||
|
''' + Fore.RESET)
|
||||||
|
# 天欣安全实验室
|
||||||
|
|
||||||
|
|
||||||
|
def writeExcel(titles, links,ws):
    """Append one (title, link) row to *ws* for every title/link pair.

    Pairs are formed positionally; surplus items of the longer sequence are
    ignored. *ws* only needs an ``append`` method.
    """
    rows = [pair for pair in zip(titles, links)]
    for pair in rows:
        ws.append(pair)
|
||||||
|
|
||||||
|
def create_sheet_and_write(wb, engine, keywords, num, title):
    """Run one search engine and store its results in a new worksheet.

    A sheet named *title* is created in *wb* first; then ``engine(keywords,
    num)`` is called and items 0 and 1 of its result are written to the
    sheet through writeExcel().
    """
    sheet = wb.create_sheet(title=title)
    found = engine(keywords, num)
    found_titles = found[0]
    found_links = found[1]
    writeExcel(found_titles, found_links, sheet)
|
||||||
|
def excel_text2url(link_url):
    """Return an Excel HYPERLINK formula that makes *link_url* clickable.

    The URL is used both as the link target and as the displayed text.
    Double quotes inside the URL are doubled, which is how Excel escapes a
    quote inside a string literal; without that the formula would be cut
    short at the first quote.
    """
    escaped = str(link_url).replace('"', '""')
    return f'=HYPERLINK("{escaped}","{escaped}")'
|
||||||
|
# 遍历所有工作表,并将第二列的所有数据传递给 excel_text2url 函数重新赋值
|
||||||
|
def update_hyperlinks(wb):
    """Turn every non-empty cell in column 2 of every sheet into a hyperlink.

    Each truthy cell value is replaced by the formula returned by
    excel_text2url(); empty cells are left untouched.
    """
    for sheet in wb.worksheets:
        second_column = sheet.iter_rows(
            min_row=1, max_row=sheet.max_row, min_col=2, max_col=2
        )
        # Every yielded row holds exactly one cell (column 2 only)
        for (cell,) in second_column:
            if not cell.value:
                continue
            cell.value = excel_text2url(cell.value)
|
||||||
|
|
||||||
|
def commend():
    """Parse the command-line options and return the argparse namespace.

    Options: ``-k`` search keywords (one or more), ``-p`` page count or
    ``start:end`` range (default '5'), ``-m`` search engines (default 'all').
    When the script is started without any argument, the banner and the help
    text are printed and the process exits.
    """
    parser = argparse.ArgumentParser(prog="Searcher", description='此工具用于对百度、必应和谷歌搜索的协程爬取--天欣安全实验室', usage='please read -h')
    parser.add_argument("-k", type=str, help="搜索的关键词", nargs='+')
    # -p is kept as a string so that a "start:end" page range can be passed
    parser.add_argument("-p", type=str, help="需要搜索页数,默认为5,支持范围搜索,例如搜索从第2页到第五页的参数为 2:5", default='5')
    # NOTE(review): the default is the string 'all', but a supplied value is a list (nargs='+')
    parser.add_argument("-m", type=str, help="使用的搜索引擎:百度:bd,必应:bin,谷歌:goo 不填写默认使用全部", default='all',nargs='+')
    # parser.add_argument("-t", '--task', type=int, help="设置的线程,默认为8", default=8)
    parser.exit_on_error = False
    args = parser.parse_args()
    # No arguments at all: show the banner and usage, then quit
    if len(sys.argv) == 1:
        printascii()
        parser.print_help()
        sys.exit()
    return args
|
||||||
|
def search_company_info(company_name_arg, num):
    """Search Bing for a company and keep the hits on company-registry sites.

    Args:
        company_name_arg: search keywords; surrounding whitespace is removed.
        num: page specification passed through to ``Bing.bing_main``.

    Returns:
        A list of ``[title, url]`` pairs whose URL starts with one of the
        aiqicha / qcc / tianyancha company-page prefixes.
    """
    # Plain prefixes instead of a regex: the previous pattern left its dots
    # unescaped, so look-alike hosts were accepted as well.
    company_site_prefixes = (
        "https://aiqicha.baidu.com/company_detail_",
        "https://www.qcc.com/firm/",
        "https://www.tianyancha.com/company/",
    )
    keywords = company_name_arg.strip()
    result = Bing.bing_main(keywords, num)

    data_list = []
    # Item 0 holds the titles and item 1 the URLs, in matching order
    for title, url in zip(result[0], result[1]):
        print(f"必应搜索爬取结果为,title:{title}, url:{url}")
        # A non-string URL (e.g. None) is simply not a match
        if isinstance(url, str) and url.startswith(company_site_prefixes):
            data_list.append([title, url])
    return data_list
|
||||||
|
|
||||||
|
def filter_company_sites(urls):
    """Return the URLs that point at a supported company-registry page.

    A URL is kept when it starts with one of these prefixes:
        https://aiqicha.baidu.com/company_detail_
        https://www.qcc.com/firm/
        https://www.tianyancha.com/company/

    The input order is preserved. Entries that are not strings (for example
    ``None``) are dropped instead of raising.
    """
    # Plain prefixes instead of a regex: the previous pattern left its dots
    # unescaped, so hosts such as "aiqichaXbaidu.com" were accepted as well.
    company_site_prefixes = (
        "https://aiqicha.baidu.com/company_detail_",
        "https://www.qcc.com/firm/",
        "https://www.tianyancha.com/company/",
    )
    return [
        url
        for url in urls
        if isinstance(url, str) and url.startswith(company_site_prefixes)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def search_one_company(company_name_arg, num):
    """Search Bing and Baidu for one company and save the hits to an .xlsx file.

    Args:
        company_name_arg: search keywords (typically the company name).
        num: page specification passed through to both search engines.

    The workbook gets one sheet per engine, every URL in column 2 is turned
    into a clickable hyperlink, and the file is written to the current
    directory as ``<keywords>-<company name>.xlsx``.
    """
    keywords = company_name_arg.strip()
    print(f"您搜索的关键词为:{keywords}")
    wb = openpyxl.Workbook()
    # Drop the default sheet that a new workbook starts with
    wb.remove(wb['Sheet'])
    printascii()
    # Characters that are not allowed in Windows file names
    pattern = r"[\\/:\*\?\"<>|]"
    keyword = re.sub(pattern, "", keywords)
    # Both parts of the file name must be sanitised; the raw argument used
    # to be inserted unchanged, which defeated the clean-up above.
    safe_company_name = re.sub(pattern, "", company_name_arg)
    create_sheet_and_write(wb, Bing.bing_main, keywords, num, "必应爬取结果")
    create_sheet_and_write(wb, Baidu.baidu_main, keywords, num, "百度爬取结果")
    # Make every URL clickable
    update_hyperlinks(wb)
    wb.save(f'./{keyword}-{safe_company_name}.xlsx')
    print(Fore.GREEN + '总任务结束!' + Fore.RESET)
    end = time.time()
    # Elapsed time since module import; reset the colour so later output is not red
    print(Fore.RED + f'脚本总时间: {end - start:.2f}' + Fore.RESET)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Load the company names from data.csv (column index 0, no header row)
    reader = CSVReader('data.csv')
    company_names = reader.read_column(0, has_header=False)
    print("所有数据:", company_names)

    i= 1
    for company_name in company_names:
        # Pause 6-10 seconds (5 plus a random 1..5) before each search
        sleep_time = 5
        sleep_time += random.randint(1, 5)
        time.sleep(sleep_time)
        # Append the registry site names to the query
        company_name += " 爱企查|企查查"
        data_list = search_company_info(company_name, '1')
        print(data_list)
        i=i+1
        # NOTE(review): i starts at 1, so this stops after the first company;
        # looks like a debugging limit — confirm before processing the full list.
        if i > 1:
            break
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
20
requirements.txt
Normal file
20
requirements.txt
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
aiofiles==24.1.0
|
||||||
|
aiohappyeyeballs==2.4.0
|
||||||
|
aiohttp==3.10.5
|
||||||
|
aiosignal==1.3.1
|
||||||
|
async-timeout==4.0.3
|
||||||
|
attrs==24.2.0
|
||||||
|
beautifulsoup4==4.12.3
|
||||||
|
colorama==0.4.6
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
frozenlist==1.4.1
|
||||||
|
greenlet==3.2.4
|
||||||
|
idna==3.8
|
||||||
|
lxml==5.3.0
|
||||||
|
multidict==6.1.0
|
||||||
|
openpyxl==3.1.5
|
||||||
|
playwright==1.55.0
|
||||||
|
pyee==13.0.0
|
||||||
|
soupsieve==2.6
|
||||||
|
typing_extensions==4.12.2
|
||||||
|
yarl==1.11.1
|
||||||
128
search/Baidu.py
Normal file
128
search/Baidu.py
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import re
|
||||||
|
import aiofiles
|
||||||
|
import urllib.parse
|
||||||
|
import argparse
|
||||||
|
from colorama import init, Fore
|
||||||
|
import ssl
|
||||||
|
from urllib.parse import quote
|
||||||
|
# 添加项目根目录到 sys.path
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
import config
|
||||||
|
baiduheaders=config.baiduheaders
|
||||||
|
|
||||||
|
# Timeout settings used for the Baidu result-page requests
timeout = aiohttp.ClientTimeout(
    total=None,  # no limit on the total request time
    sock_connect=5.5,  # socket connect timeout: 5.5 seconds
    sock_read=5.5  # socket read timeout: 5.5 seconds
)
|
||||||
|
#--天欣安全实验室--#
|
||||||
|
|
||||||
|
# 初次请求获取百度加密后的url
|
||||||
|
async def getfirstinfo(keyword, pn,session):
    """Fetch one Baidu result page and extract titles and redirect links.

    Args:
        keyword: URL-encoded search keyword.
        pn: value of Baidu's ``pn`` query parameter (result offset).
        session: the aiohttp client session to send the request with.

    Returns:
        ``(titles, links)``: two lists of equal length. The links are Baidu
        redirect URLs that still have to be resolved. ``([], [])`` is
        returned when all three attempts fail.
    """
    # NOTE(security): certificate verification is switched off for this request.
    sslcontext = ssl.create_default_context()
    sslcontext.check_hostname = False
    sslcontext.verify_mode = ssl.CERT_NONE
    url = f'https://www.baidu.com/s?wd={keyword}&pn={pn}'
    for _attempt in range(3):
        # Fresh lists per attempt, so a failure in the middle of parsing
        # cannot leave duplicated or misaligned entries for the retry.
        titlelist = []
        fakeurl = []
        try:
            async with session.get(url, headers=baiduheaders, ssl=sslcontext,timeout=timeout) as resp:
                html = await resp.text()
                soup = BeautifulSoup(html, 'lxml')
                for heading in soup.select('h3.t'):
                    anchor = heading.a
                    href = anchor.get('href') if anchor is not None else None
                    # Skip headings without a usable link instead of failing the page
                    if not href:
                        continue
                    title = heading.text.replace('\n', '').replace(',', ' ').replace('\ue636', '').strip()
                    titlelist.append(title)
                    fakeurl.append(href)
                return titlelist, fakeurl
        except Exception:
            # Best effort: report the failure and try again
            print("baidu链接失败,正在重新尝试...")
    print(f"百度任务出错:{url}该url无法正常获取数据。")
    return [],[]
|
||||||
|
|
||||||
|
# 再次请求获取真实的网站url
|
||||||
|
async def gettrueurl(url,printtitle,session):
    """Resolve a Baidu redirect link to the real target URL.

    The link is requested without following redirects and the ``Location``
    response header is returned. When the response has no ``Location``
    header, *url* joined onto the Baidu domain is returned instead. On any
    error the input *url* is returned unchanged.

    Args:
        url: Baidu redirect link.
        printtitle: result title, only used for the progress output.
        session: the aiohttp client session to send the request with.
    """
    try:
        domain = 'https://www.baidu.com/'
        async with session.get(url, headers=baiduheaders, allow_redirects=False) as resp:
            # The previous check compared str(header) with None and '', which
            # is always true, so a missing header produced the URL 'None'.
            # Only the header is needed, so the body is no longer read.
            location = resp.headers.get('Location')
            if location:
                trueurl = str(location)
                print(printtitle," ",trueurl)
                return trueurl
            print(url + '该url无法转跳')
            url = urllib.parse.urljoin(domain, url)
            print(printtitle, " ",url)
            return url
    except Exception:
        # Best effort: keep the unresolved link rather than losing the result
        return url
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def baidu_spinder(keyword, num):
    """Crawl Baidu result pages for *keyword* and resolve the result links.

    Args:
        keyword: URL-encoded search keyword.
        num: page specification as a string, either a page count ("5") or an
            inclusive 1-based range ("2:5").

    Returns:
        ``(titles, urls)``: the result titles and the resolved URLs.

    Raises:
        ValueError: when *num* contains more than one ':' or a range bound
            is not a number.
    """
    print(f'百度爬取任务进行中,爬取页数为{num}...')

    # Translate the page specification into offsets for the `pn` parameter
    if ':' not in num:
        start_page, end_page = 0, int(num) * 10
    elif num.count(':') > 1:
        raise ValueError("输入中必须且只能包含一个 ':'")
    else:
        first_page, last_page = num.split(':')
        if not (first_page.isdigit() and last_page.isdigit()):
            raise ValueError("':' 两侧的值必须是数字")
        start_page = (int(first_page) - 1) * 10
        end_page = int(last_page) * 10

    # Step 1: download every result page concurrently
    async with aiohttp.ClientSession() as session:
        page_tasks = [
            asyncio.create_task(getfirstinfo(keyword, pn, session))
            for pn in range(start_page, end_page, 10)
        ]
        pages = await asyncio.gather(*page_tasks)

    # Step 2: resolve every redirect link concurrently
    titlelist = []
    urllist = []
    async with aiohttp.ClientSession() as session:
        redirect_tasks = []
        for page_titles, page_links in pages:
            titlelist += page_titles
            for position, link in enumerate(page_links):
                printtitle = page_titles[position]
                if not link.startswith(('http://', 'https://')):
                    # Relative links are resolved against the Baidu domain
                    link = urllib.parse.urljoin('http://www.baidu.com/', link)
                redirect_tasks.append(
                    asyncio.create_task(gettrueurl(link, printtitle, session))
                )
        print('标题\t URL\t')
        urllist += await asyncio.gather(*redirect_tasks)

    count = len(urllist)
    print(f"百度搜索爬取结果数量为{count}")
    print(Fore.GREEN + '百度爬取任务完成!\n' + Fore.RESET)
    return titlelist, urllist
|
||||||
|
# await baiduwriteCSV(titlelist, urllist, keyword)
|
||||||
|
|
||||||
|
|
||||||
|
def baidu_main(keyword, num):
    """Synchronous entry point: crawl Baidu and return ``(titles, urls)``.

    *keyword* is URL-encoded here before being passed to baidu_spinder().
    """
    keyword = quote(keyword)
    if sys.platform.startswith('win'):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    # asyncio.run() creates and closes its own event loop; calling
    # asyncio.get_event_loop() without a running loop is deprecated.
    return asyncio.run(baidu_spinder(keyword, num))
|
||||||
|
|
||||||
|
async def Baidu_main(keywords, num):
    """Async entry point: await baidu_spinder() and return its result.

    NOTE(review): unlike baidu_main(), the keywords are not URL-encoded
    here — confirm that callers pass an already encoded value.
    """
    return await baidu_spinder(keywords, num)
|
||||||
90
search/Bing.py
Normal file
90
search/Bing.py
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import urllib.parse
|
||||||
|
from urllib.parse import quote
|
||||||
|
import aiohttp
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from colorama import Fore
|
||||||
|
|
||||||
|
# 添加项目根目录到 sys.path
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
import config
|
||||||
|
bingheaders=config.bingheaders
|
||||||
|
proxy=config.proxy
|
||||||
|
# Timeout settings used for the Bing requests
timeout = aiohttp.ClientTimeout(
    total=None,  # no limit on the total request time
    sock_connect=5.5,  # socket connect timeout: 5.5 seconds
    sock_read=5.5  # socket read timeout: 5.5 seconds
)
|
||||||
|
async def getbing(url, session):
    """Fetch one Bing result page and extract the result links and titles.

    Args:
        url: full Bing search URL.
        session: the aiohttp client session to send the request with.

    Returns:
        ``(urls, titles)`` — note the order: URLs first. ``([], [])`` is
        returned when the page cannot be fetched or parsed.
    """
    url_list = []
    title_list = []
    try:
        # The request is inside the try block as well, so that a network
        # error on one page does not abort the whole crawl.
        async with session.get(url, headers=bingheaders,timeout=timeout) as resp:
            html = await resp.text()
        soup = BeautifulSoup(html, 'lxml')
        for anchor in soup.select('h2 a'):
            hurl = anchor.get('href')
            # Skip anchors without a target instead of discarding the page
            if not hurl:
                continue
            htext = anchor.text.replace('\n', '').replace(',', ' ').strip()
            if not hurl.startswith(('http://', 'https://')):
                # Relative links are resolved against the Bing domain
                hurl = urllib.parse.urljoin('https://cn.bing.com/', hurl)
            print(htext," ",hurl)
            title_list.append(htext)
            url_list.append(hurl)
    except Exception:
        # Best effort: report the failure and return an empty page
        print(f"必应页面爬取失败,{url}该url无法正常获取数据。")
        return [],[]
    return url_list, title_list
|
||||||
|
|
||||||
|
|
||||||
|
async def bing_spinder(keyword, num):
    """Crawl Bing result pages for ``keyword`` concurrently.

    Args:
        keyword: Search keyword, inserted into the query string as given
            (``bing_main`` URL-quotes it before calling this coroutine).
        num: Page specification. Either a page count (``"3"`` or ``3``:
            pages 1-3) or an inclusive 1-based range ``"start:end"``
            (``"2:4"``: pages 2-4).

    Returns:
        A ``(titlelist, urllist)`` tuple of parallel lists covering all
        requested pages. Note the order is the reverse of ``getbing``'s.

    Raises:
        ValueError: If ``num`` contains more than one ``':'``, if either side
            of the ``':'`` is not a non-negative integer, or if a plain page
            count cannot be converted to ``int``.
    """
    print(f'必应爬取任务进行中,爬取页数为{num}...')
    print('标题 url')

    start_offset, end_offset = _bing_page_offsets(num)

    async with aiohttp.ClientSession() as session:
        # One task per result page (10 results per page). ``first`` selects
        # the page; without it every task would download the same first page.
        # NOTE(review): ``first`` is taken to be Bing's 1-based index of the
        # first result - confirm against live responses.
        tasks = [
            asyncio.create_task(getbing(
                f'https://cn.bing.com/search?q={keyword}'
                f'&qs=n&form=QBRE&sp=-1&lq=0&first={pn + 1}',
                session,
            ))
            for pn in range(start_offset, end_offset, 10)
        ]
        result = await asyncio.gather(*tasks)

    urllist = []
    titlelist = []
    # gather() keeps task order, so the pages are merged in page order.
    for page_urls, page_titles in result:
        urllist.extend(page_urls)
        titlelist.extend(page_titles)

    count = len(urllist)
    print(f"必应搜索爬取结果为{count}")
    print(Fore.GREEN + '必应爬取任务完成\n' + Fore.RESET)
    return titlelist, urllist


def _bing_page_offsets(num):
    """Convert a page specification into ``(start, end)`` result offsets."""
    # Accept an int page count as well as the string forms.
    spec = str(num)
    if ':' not in spec:
        return 0, int(spec) * 10
    if spec.count(':') > 1:
        raise ValueError("输入中必须且只能包含一个 ':'")
    first_page, last_page = spec.split(':')
    if not (first_page.isdigit() and last_page.isdigit()):
        raise ValueError("':' 两侧的值必须是数字")
    # Pages are 1-based and the range is inclusive; each page is 10 results.
    return (int(first_page) - 1) * 10, int(last_page) * 10
|
||||||
|
|
||||||
|
|
||||||
|
def bing_main(keyword, num):
    """Synchronous entry point: run the Bing crawl to completion.

    The keyword is URL-quoted, then ``bing_spinder`` is driven on the current
    event loop. On Windows the selector event loop policy is installed first.

    Args:
        keyword: Raw (unquoted) search keyword.
        num: Page specification, forwarded to ``bing_spinder``.

    Returns:
        Whatever ``bing_spinder`` returns: a ``(titlelist, urllist)`` tuple.
    """
    keyword = quote(keyword)
    if sys.platform.startswith('win'):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    # get_event_loop() raises RuntimeError when the thread has no current
    # loop (non-main threads, after the loop was cleared, newer Python
    # versions). Fall back to a fresh loop instead of crashing, and install
    # it so later get_event_loop() callers in this thread can reuse it.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    return loop.run_until_complete(bing_spinder(keyword, num))
|
||||||
|
|
||||||
|
async def Bing_main(keywords, num):
    """Coroutine entry point for the Bing search.

    Awaits ``bing_spinder(keywords, num)`` and returns its result unchanged.
    Unlike ``bing_main``, the keyword is forwarded without URL-quoting.

    Args:
        keywords: Search keyword(s), forwarded as-is.
        num: Page specification, forwarded as-is.
    """
    outcome = await bing_spinder(keywords, num)
    return outcome
|
||||||
BIN
search/__pycache__/Baidu.cpython-313.pyc
Normal file
BIN
search/__pycache__/Baidu.cpython-313.pyc
Normal file
Binary file not shown.
BIN
search/__pycache__/Bing.cpython-313.pyc
Normal file
BIN
search/__pycache__/Bing.cpython-313.pyc
Normal file
Binary file not shown.
BIN
tool/__pycache__/read_csv.cpython-313.pyc
Normal file
BIN
tool/__pycache__/read_csv.cpython-313.pyc
Normal file
Binary file not shown.
190
tool/read_csv.py
Normal file
190
tool/read_csv.py
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
import csv
|
||||||
|
from typing import List, Dict, Union, Any, Optional
|
||||||
|
|
||||||
|
class CSVReader:
    """Small helper for reading CSV files.

    Supports files with and without a header row. Data can be returned as
    whole rows, as a single column, or as a renamed selection of columns,
    addressed either by header name or by 0-based column index.
    """

    def __init__(self, file_path: str):
        """Store the path of the CSV file; the file is opened on every read.

        Args:
            file_path: Path of the CSV file.
        """
        self.file_path = file_path

    def read(self, has_header: bool = True, encoding: str = 'utf-8') -> List[Dict[str, Any]]:
        """Read the whole file and return one dict per data row.

        Args:
            has_header: Whether the first row holds the column names.
            encoding: Text encoding of the file.

        Returns:
            With a header, each row is keyed by the header names. Without a
            header, each row is keyed ``col_0``, ``col_1``, ... by position.
        """
        # newline='' is what the csv module requires: it lets the parser see
        # the raw line endings, so line breaks inside quoted fields survive
        # unchanged instead of being rewritten by newline translation.
        with open(self.file_path, 'r', encoding=encoding, newline='') as file:
            if has_header:
                return [dict(row) for row in csv.DictReader(file)]
            return [
                {f'col_{i}': value for i, value in enumerate(row)}
                for row in csv.reader(file)
            ]

    def read_column(self, column: Union[int, str], has_header: bool = True,
                    encoding: str = 'utf-8') -> List[Any]:
        """Return the values of a single column.

        Args:
            column: 0-based column index, or a header name (names are only
                accepted when ``has_header`` is true).
            has_header: Whether the first row holds the column names.
            encoding: Text encoding of the file.

        Returns:
            The column values in row order. Without a header, rows too short
            to contain the column are skipped. With a header and no data
            rows, an empty list is returned for an index.

        Raises:
            KeyError: If a header name is given that the rows do not contain.
            IndexError: If an index is outside the header's column range.
            ValueError: If ``column`` is neither an index nor a usable name.
        """
        data = self.read(has_header, encoding)

        if has_header and isinstance(column, str):
            return [row[column] for row in data]

        if not isinstance(column, int):
            raise ValueError("Invalid column parameter")

        if not has_header:
            column_name = f'col_{column}'
            return [row[column_name] for row in data if column_name in row]

        if not data:
            return []

        # Translate the index into a header name using the first row's keys.
        keys = list(data[0].keys())
        if not 0 <= column < len(keys):
            raise IndexError(f"Column index {column} out of range")
        column_name = keys[column]
        return [row[column_name] for row in data]

    def read_columns(self, columns: Dict[str, Union[int, str]], has_header: bool = True,
                     encoding: str = 'utf-8') -> List[Dict[str, Any]]:
        """Return a selection of columns, optionally renamed.

        Args:
            columns: Maps each output name to the source column, given as a
                0-based index or (with a header) as a header name.
            has_header: Whether the first row holds the column names.
            encoding: Text encoding of the file.

        Returns:
            One dict per data row containing every output name in
            ``columns``. A value is ``''`` when the source column cannot be
            resolved (index out of range, name without a header) or when the
            row does not contain it.
        """
        all_data = self.read(has_header, encoding)
        header_keys = list(all_data[0].keys()) if all_data else []

        # Resolve every requested column to its row-dict key once, instead
        # of repeating the same lookup for each row. Entries are
        # (output name, resolved?, source key).
        plan = []
        for new_name, old_column in columns.items():
            if isinstance(old_column, str) and has_header:
                plan.append((new_name, True, old_column))
            elif isinstance(old_column, int):
                if not has_header:
                    plan.append((new_name, True, f'col_{old_column}'))
                elif 0 <= old_column < len(header_keys):
                    plan.append((new_name, True, header_keys[old_column]))
                else:
                    plan.append((new_name, False, None))
            else:
                # Unresolvable source (e.g. a name without a header): still
                # emit the key so every result row has the same shape.
                plan.append((new_name, False, None))

        return [
            {
                new_name: row.get(key, '') if resolved else ''
                for new_name, resolved, key in plan
            }
            for row in all_data
        ]
|
||||||
|
|
||||||
|
# 使用示例
|
||||||
|
if __name__ == "__main__":
    # Demo: read ../data.csv as a file WITHOUT a header row.
    demo_reader = CSVReader('../data.csv')

    # Select the first column and expose it under the name 'company_name'.
    renamed_rows = demo_reader.read_columns({'company_name': 0}, has_header=False)
    print("所有数据:", renamed_rows)

    # The same column as a plain list of values.
    first_column = demo_reader.read_column(0, has_header=False)
    print("所有数据:", first_column)

    # Further usage, kept for reference.
    #
    # Example 1 - file with a header row, e.g. data.csv:
    #     name,age,city
    #     Alice,25,Beijing
    #     Bob,30,Shanghai
    #     Charlie,35,Guangzhou
    #
    #     all_rows = demo_reader.read(has_header=True)
    #     names = demo_reader.read_column('name', has_header=True)   # by name
    #     ages = demo_reader.read_column(1, has_header=True)         # by index
    #     picked = demo_reader.read_columns({'姓名': 'name', '年龄': 1},
    #                                       has_header=True)         # renamed
    #
    # Example 2 - file without a header row, e.g. data_no_header.csv:
    #     Alice,25,Beijing
    #     Bob,30,Shanghai
    #     Charlie,35,Guangzhou
    #
    #     reader2 = CSVReader('data_no_header.csv')
    #     all_rows = reader2.read(has_header=False)
    #     first = reader2.read_column(0, has_header=False)
    #     picked = reader2.read_columns({'姓名': 0, '城市': 2}, has_header=False)
|
||||||
BIN
~$亚信科技(南京)有限公司-亚信科技(南京)有限公司.xlsx
Normal file
BIN
~$亚信科技(南京)有限公司-亚信科技(南京)有限公司.xlsx
Normal file
Binary file not shown.
BIN
~$杭州云印智造科技有限公司-杭州云印智造科技有限公司.xlsx
Normal file
BIN
~$杭州云印智造科技有限公司-杭州云印智造科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
中国移动通信集团浙江有限公司余杭分公司西溪八方城(自营厅)-中国移动通信集团浙江有限公司余杭分公司西溪八方城(自营厅).xlsx
Normal file
BIN
中国移动通信集团浙江有限公司余杭分公司西溪八方城(自营厅)-中国移动通信集团浙江有限公司余杭分公司西溪八方城(自营厅).xlsx
Normal file
Binary file not shown.
BIN
亚信科技(南京)有限公司-亚信科技(南京)有限公司.xlsx
Normal file
BIN
亚信科技(南京)有限公司-亚信科技(南京)有限公司.xlsx
Normal file
Binary file not shown.
BIN
医贝云服(杭州)科技有限公司-医贝云服(杭州)科技有限公司.xlsx
Normal file
BIN
医贝云服(杭州)科技有限公司-医贝云服(杭州)科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
天禄(杭州)科技有限公司-天禄(杭州)科技有限公司.xlsx
Normal file
BIN
天禄(杭州)科技有限公司-天禄(杭州)科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
如是启创(杭州)科技有限公司-如是启创(杭州)科技有限公司.xlsx
Normal file
BIN
如是启创(杭州)科技有限公司-如是启创(杭州)科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
幻想集团·杭州运营中心-幻想集团·杭州运营中心.xlsx
Normal file
BIN
幻想集团·杭州运营中心-幻想集团·杭州运营中心.xlsx
Normal file
Binary file not shown.
BIN
新疆浙疆果业有限公司-新疆浙疆果业有限公司.xlsx
Normal file
BIN
新疆浙疆果业有限公司-新疆浙疆果业有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州一喂智能科技有限公司-杭州一喂智能科技有限公司.xlsx
Normal file
BIN
杭州一喂智能科技有限公司-杭州一喂智能科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州乂游网络科技有限公司-杭州乂游网络科技有限公司.xlsx
Normal file
BIN
杭州乂游网络科技有限公司-杭州乂游网络科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州云印智造科技有限公司-杭州云印智造科技有限公司.xlsx
Normal file
BIN
杭州云印智造科技有限公司-杭州云印智造科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州云迹物联科技有限公司-杭州云迹物联科技有限公司.xlsx
Normal file
BIN
杭州云迹物联科技有限公司-杭州云迹物联科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州人谋天成科技有限公司-杭州人谋天成科技有限公司.xlsx
Normal file
BIN
杭州人谋天成科技有限公司-杭州人谋天成科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州伍壹荟旅游咨资询有限公司-杭州伍壹荟旅游咨资询有限公司.xlsx
Normal file
BIN
杭州伍壹荟旅游咨资询有限公司-杭州伍壹荟旅游咨资询有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州倍驰科技有限公司-杭州倍驰科技有限公司.xlsx
Normal file
BIN
杭州倍驰科技有限公司-杭州倍驰科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州千骏轴承有限公司-杭州千骏轴承有限公司.xlsx
Normal file
BIN
杭州千骏轴承有限公司-杭州千骏轴承有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州塞牧文化传媒有限公司-杭州塞牧文化传媒有限公司.xlsx
Normal file
BIN
杭州塞牧文化传媒有限公司-杭州塞牧文化传媒有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州奥得徕贸易有限公司-杭州奥得徕贸易有限公司.xlsx
Normal file
BIN
杭州奥得徕贸易有限公司-杭州奥得徕贸易有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州字节跳动科技有限公司-杭州字节跳动科技有限公司.xlsx
Normal file
BIN
杭州字节跳动科技有限公司-杭州字节跳动科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州孚伦特科技有限公司-杭州孚伦特科技有限公司.xlsx
Normal file
BIN
杭州孚伦特科技有限公司-杭州孚伦特科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州密尔沃智能装备有限公司-杭州密尔沃智能装备有限公司.xlsx
Normal file
BIN
杭州密尔沃智能装备有限公司-杭州密尔沃智能装备有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州巨量引擎网络技术有限公司-杭州巨量引擎网络技术有限公司.xlsx
Normal file
BIN
杭州巨量引擎网络技术有限公司-杭州巨量引擎网络技术有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州心满意定供应链服务有限公司-杭州心满意定供应链服务有限公司.xlsx
Normal file
BIN
杭州心满意定供应链服务有限公司-杭州心满意定供应链服务有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州心灵部落教育有限公司(灵动生活)-杭州心灵部落教育有限公司(灵动生活).xlsx
Normal file
BIN
杭州心灵部落教育有限公司(灵动生活)-杭州心灵部落教育有限公司(灵动生活).xlsx
Normal file
Binary file not shown.
BIN
杭州慈山科技有限公司-杭州慈山科技有限公司.xlsx
Normal file
BIN
杭州慈山科技有限公司-杭州慈山科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州扬拓体育科技有限公司-杭州扬拓体育科技有限公司.xlsx
Normal file
BIN
杭州扬拓体育科技有限公司-杭州扬拓体育科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州摸象大数据科技有限公司-杭州摸象大数据科技有限公司.xlsx
Normal file
BIN
杭州摸象大数据科技有限公司-杭州摸象大数据科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州星瀚知识产权代理有限公司-杭州星瀚知识产权代理有限公司.xlsx
Normal file
BIN
杭州星瀚知识产权代理有限公司-杭州星瀚知识产权代理有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州晓羽科技有限公司-杭州晓羽科技有限公司.xlsx
Normal file
BIN
杭州晓羽科技有限公司-杭州晓羽科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州未知数品牌管理有限公司-杭州未知数品牌管理有限公司.xlsx
Normal file
BIN
杭州未知数品牌管理有限公司-杭州未知数品牌管理有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州琑为缘文化艺术有限公司-杭州琑为缘文化艺术有限公司.xlsx
Normal file
BIN
杭州琑为缘文化艺术有限公司-杭州琑为缘文化艺术有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州瑞泡特教育科技有限公司-杭州瑞泡特教育科技有限公司.xlsx
Normal file
BIN
杭州瑞泡特教育科技有限公司-杭州瑞泡特教育科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州瑾馨贸易集团有限公司-杭州瑾馨贸易集团有限公司.xlsx
Normal file
BIN
杭州瑾馨贸易集团有限公司-杭州瑾馨贸易集团有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州甬盛通信技术有限公司-杭州甬盛通信技术有限公司.xlsx
Normal file
BIN
杭州甬盛通信技术有限公司-杭州甬盛通信技术有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州着墨文化创意者限公司-杭州着墨文化创意者限公司.xlsx
Normal file
BIN
杭州着墨文化创意者限公司-杭州着墨文化创意者限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州禾露则正生物科技有限公司-杭州禾露则正生物科技有限公司.xlsx
Normal file
BIN
杭州禾露则正生物科技有限公司-杭州禾露则正生物科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州绿城衡宇环境设计有限公司-杭州绿城衡宇环境设计有限公司.xlsx
Normal file
BIN
杭州绿城衡宇环境设计有限公司-杭州绿城衡宇环境设计有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州羊咩咩文化传媒有限公司-杭州羊咩咩文化传媒有限公司.xlsx
Normal file
BIN
杭州羊咩咩文化传媒有限公司-杭州羊咩咩文化传媒有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州裕阳经营管理合伙企业(有限合伙)-杭州裕阳经营管理合伙企业(有限合伙).xlsx
Normal file
BIN
杭州裕阳经营管理合伙企业(有限合伙)-杭州裕阳经营管理合伙企业(有限合伙).xlsx
Normal file
Binary file not shown.
BIN
杭州贝享健康科技有限公司-杭州贝享健康科技有限公司.xlsx
Normal file
BIN
杭州贝享健康科技有限公司-杭州贝享健康科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州辉望科技有限公司-杭州辉望科技有限公司.xlsx
Normal file
BIN
杭州辉望科技有限公司-杭州辉望科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州辉煌物业管理有限公司-杭州辉煌物业管理有限公司.xlsx
Normal file
BIN
杭州辉煌物业管理有限公司-杭州辉煌物业管理有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州邻汇网络科技有限公司-杭州邻汇网络科技有限公司.xlsx
Normal file
BIN
杭州邻汇网络科技有限公司-杭州邻汇网络科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州钱橙似锦科技有限公司-杭州钱橙似锦科技有限公司.xlsx
Normal file
BIN
杭州钱橙似锦科技有限公司-杭州钱橙似锦科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州锐擎科技有限公司-杭州锐擎科技有限公司.xlsx
Normal file
BIN
杭州锐擎科技有限公司-杭州锐擎科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州阿克莱斯设备有限公司-杭州阿克莱斯设备有限公司.xlsx
Normal file
BIN
杭州阿克莱斯设备有限公司-杭州阿克莱斯设备有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州霖思网络科技有限公司-杭州霖思网络科技有限公司.xlsx
Normal file
BIN
杭州霖思网络科技有限公司-杭州霖思网络科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州音视贝科技有限公司-杭州音视贝科技有限公司.xlsx
Normal file
BIN
杭州音视贝科技有限公司-杭州音视贝科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州颐刻生物科技有限公司-杭州颐刻生物科技有限公司.xlsx
Normal file
BIN
杭州颐刻生物科技有限公司-杭州颐刻生物科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州骏远电子商务有限公司-杭州骏远电子商务有限公司.xlsx
Normal file
BIN
杭州骏远电子商务有限公司-杭州骏远电子商务有限公司.xlsx
Normal file
Binary file not shown.
BIN
杭州麒晨科技有限公司-杭州麒晨科技有限公司.xlsx
Normal file
BIN
杭州麒晨科技有限公司-杭州麒晨科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
氧气.康复中心-氧气.康复中心.xlsx
Normal file
BIN
氧气.康复中心-氧气.康复中心.xlsx
Normal file
Binary file not shown.
BIN
浙江丝里伯睡眠科技股份有限公司-浙江丝里伯睡眠科技股份有限公司.xlsx
Normal file
BIN
浙江丝里伯睡眠科技股份有限公司-浙江丝里伯睡眠科技股份有限公司.xlsx
Normal file
Binary file not shown.
BIN
浙江全应科技有限公司-浙江全应科技有限公司.xlsx
Normal file
BIN
浙江全应科技有限公司-浙江全应科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
浙江八方电信科技集团有限公司-浙江八方电信科技集团有限公司.xlsx
Normal file
BIN
浙江八方电信科技集团有限公司-浙江八方电信科技集团有限公司.xlsx
Normal file
Binary file not shown.
BIN
浙江力一科技有限公司-浙江力一科技有限公司.xlsx
Normal file
BIN
浙江力一科技有限公司-浙江力一科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
浙江商盟支付有限公司-浙江商盟支付有限公司.xlsx
Normal file
BIN
浙江商盟支付有限公司-浙江商盟支付有限公司.xlsx
Normal file
Binary file not shown.
BIN
浙江建盛安全科技有限公司-浙江建盛安全科技有限公司.xlsx
Normal file
BIN
浙江建盛安全科技有限公司-浙江建盛安全科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
浙江彩屋信息技术有限公司-浙江彩屋信息技术有限公司.xlsx
Normal file
BIN
浙江彩屋信息技术有限公司-浙江彩屋信息技术有限公司.xlsx
Normal file
Binary file not shown.
BIN
浙江根旺律师事务所-浙江根旺律师事务所.xlsx
Normal file
BIN
浙江根旺律师事务所-浙江根旺律师事务所.xlsx
Normal file
Binary file not shown.
BIN
浙江海拓环境技术有限公司-浙江海拓环境技术有限公司.xlsx
Normal file
BIN
浙江海拓环境技术有限公司-浙江海拓环境技术有限公司.xlsx
Normal file
Binary file not shown.
BIN
浙江省现代农业促进会-浙江省现代农业促进会.xlsx
Normal file
BIN
浙江省现代农业促进会-浙江省现代农业促进会.xlsx
Normal file
Binary file not shown.
BIN
煜邦电力智能装备(嘉兴)有限公司-煜邦电力智能装备(嘉兴)有限公司.xlsx
Normal file
BIN
煜邦电力智能装备(嘉兴)有限公司-煜邦电力智能装备(嘉兴)有限公司.xlsx
Normal file
Binary file not shown.
BIN
瑞凤九天(杭州)科技有限公司-瑞凤九天(杭州)科技有限公司.xlsx
Normal file
BIN
瑞凤九天(杭州)科技有限公司-瑞凤九天(杭州)科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
申能环境科技有限公司-申能环境科技有限公司.xlsx
Normal file
BIN
申能环境科技有限公司-申能环境科技有限公司.xlsx
Normal file
Binary file not shown.
BIN
益思芯科技(杭州)有限公司-益思芯科技(杭州)有限公司.xlsx
Normal file
BIN
益思芯科技(杭州)有限公司-益思芯科技(杭州)有限公司.xlsx
Normal file
Binary file not shown.
BIN
远大住宅工业(杭州)有限公司-远大住宅工业(杭州)有限公司.xlsx
Normal file
BIN
远大住宅工业(杭州)有限公司-远大住宅工业(杭州)有限公司.xlsx
Normal file
Binary file not shown.
BIN
金码智能科技(杭州)有限公司-金码智能科技(杭州)有限公司.xlsx
Normal file
BIN
金码智能科技(杭州)有限公司-金码智能科技(杭州)有限公司.xlsx
Normal file
Binary file not shown.
BIN
风华(杭州)信息技术有限公司-风华(杭州)信息技术有限公司.xlsx
Normal file
BIN
风华(杭州)信息技术有限公司-风华(杭州)信息技术有限公司.xlsx
Normal file
Binary file not shown.
Reference in New Issue
Block a user