This commit is contained in:
manchuwork
2025-09-05 16:46:09 +08:00
commit 9d0f18a121
95 changed files with 1883 additions and 0 deletions

3
.idea/.gitignore generated vendored Normal file
View File

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

7
.idea/MarsCodeWorkspaceAppSettings.xml generated Normal file
View File

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="com.codeverse.userSettings.MarscodeWorkspaceAppSettingsState">
<option name="ckgOperationStatus" value="SUCCESS" />
<option name="progress" value="1.0" />
</component>
</project>

10
.idea/SearchCompany.iml generated Normal file
View File

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.13 (SearchCompany)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

6
.idea/misc.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.13 (SearchCompany)" />
</component>
</project>

8
.idea/modules.xml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/SearchCompany.iml" filepath="$PROJECT_DIR$/.idea/SearchCompany.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

1
Readme.md Normal file
View File

@@ -0,0 +1 @@
pip install -r requirements.txt

Binary file not shown.

0
company/aiqicha.py Normal file
View File

344
company/qcc.py Normal file
View File

@@ -0,0 +1,344 @@
# qcc.py
import json
import re
import os
from playwright.sync_api import sync_playwright
import argparse
def clean_text(text):
    """Normalize the whitespace of a piece of scraped text.

    Every run of whitespace (spaces, tabs, carriage returns, newlines) is
    collapsed into a single space and leading/trailing whitespace is removed.

    Args:
        text: The raw text, or a falsy value such as ``None`` or ``""``.

    Returns:
        The normalized string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    # Collapse rather than delete \r, \n and \t: deleting them first glued
    # together words that were separated only by a line break or a tab.
    return re.sub(r'\s+', ' ', text).strip()
class QCCParser:
    """Extract company registration details from a QCC company page.

    The parser works on ``table.ntable`` inside ``div.cominfo-normal``: each
    header cell (``td.tb``) is paired with the ``td`` that directly follows
    it.  Call :meth:`parse_company_info` to collect every field at once.
    """

    def __init__(self, page):
        """Remember the page to parse.

        Args:
            page: Page object to query; it must provide
                ``wait_for_load_state`` and ``query_selector``.
        """
        self.page = page
        self.company_data = {}
        # Filled in by init_table(); defined here so the attribute always
        # exists, even if a getter is called before init_table().
        self.table = None

    @staticmethod
    def _strip_related_suffix(text):
        """Remove a trailing "关联企业 <number>" badge from ``text``."""
        return re.sub(r'\s*关联企业\s*\d+$', '', text).strip()

    def _find_value_cell(self, title):
        """Return the cell next to the first header whose text contains ``title``.

        Returns ``None`` when no header matches, or when the matching header
        has no directly following ``td``.
        """
        for header in self.table.query_selector_all("td.tb"):
            if title in clean_text(header.text_content()):
                return header.query_selector("+ td")
        return None

    def init_table(self):
        """Locate the company-information table and store it in ``self.table``.

        Waits for the page to reach the ``networkidle`` load state first.

        Returns:
            True when the table was found, False (after printing a message)
            when either the container or the table is missing.
        """
        self.page.wait_for_load_state('networkidle')
        cominfo_normal = self.page.query_selector("div.cominfo-normal")
        if not cominfo_normal:
            print("未找到企业信息容器")
            return False
        self.table = cominfo_normal.query_selector("table.ntable")
        if not self.table:
            print("未找到企业信息表格")
            return False
        return True

    def get_optimized_value(self, title):
        """Return the cleaned value shown next to the header containing ``title``.

        The text of a ``.copy-value`` child is preferred over the whole cell.

        Returns:
            The cleaned text, or ``None`` when no value cell was found.
        """
        value_cell = self._find_value_cell(title)
        if not value_cell:
            return None
        copy_value = value_cell.query_selector(".copy-value")
        return clean_text((copy_value or value_cell).text_content())

    def get_legal_representative(self):
        """Return the legal representative's name, or ``None`` if not found.

        The plain table value is tried first; when it is empty, the value
        cell is searched for a name link, then any link, then its raw text.
        """
        basic_value = (self.get_optimized_value("法定代表人")
                       or self.get_optimized_value("法人"))
        if basic_value and basic_value.strip():
            return self._strip_related_suffix(basic_value)

        value_cell = self._find_value_cell("法定代表人")
        if not value_cell:
            return None

        # Links opening in a new tab are tried first as name candidates.
        for link in value_cell.query_selector_all('a[target="_blank"]'):
            name = clean_text(link.text_content())
            # Skip empty links and links that are obviously not a name.
            if name and "关联企业" not in name and "复制" not in name:
                return name

        # Fallbacks, in order: first link, .copy-value child, raw cell text.
        fallback = (value_cell.query_selector("a")
                    or value_cell.query_selector(".copy-value")
                    or value_cell)
        return self._strip_related_suffix(clean_text(fallback.text_content()))

    def get_unified_social_credit_code(self):
        """Return the unified social credit code, or ``None``/empty if absent."""
        return (self.get_optimized_value("统一社会信用代码") or
                self.get_optimized_value("信用代码"))

    def get_business_registration_no(self):
        """Return the business registration number, or ``None``/empty if absent."""
        return (self.get_optimized_value("工商注册号") or
                self.get_optimized_value("注册号"))

    def get_organization_code(self):
        """Return the organization code, or ``None`` if absent."""
        return self.get_optimized_value("组织机构代码")

    def get_taxpayer_id(self):
        """Return the taxpayer ID, falling back to the unified social credit code."""
        return (self.get_optimized_value("纳税人识别号") or
                self.get_unified_social_credit_code())

    def get_insurance_number(self):
        """Return the insured-employee count followed by the report-year text.

        Returns:
            ``"<number><report year>"`` (the year part may be empty), or
            ``None`` when the cell or the number is missing.
        """
        value_cell = self._find_value_cell("参保人数")
        if not value_cell:
            return None
        number_span = value_cell.query_selector("span")
        number = clean_text(number_span.text_content()) if number_span else None
        report_link = value_cell.query_selector("a.m-l-r-10")
        report_year = clean_text(report_link.text_content()) if report_link else ""
        return f"{number}{report_year}" if number else None

    def get_phone_number(self):
        """Return the phone number from the contact-info area, or ``None``."""
        contact_info = self.page.query_selector("div.contact-info")
        if not contact_info:
            return None
        right_part = contact_info.query_selector("div.main-part-item.right")
        if not right_part:
            return None

        # Find the row labelled "电话:".
        phone_row = None
        for row in right_part.query_selector_all("div.rline"):
            if "电话:" in clean_text(row.text_content()):
                phone_row = row
                break
        if not phone_row:
            return None

        # The number is the first copyable span that is not the label itself.
        for span in phone_row.query_selector_all("span.need-copy-field"):
            if "电话:" not in clean_text(span.text_content()):
                return clean_text(span.text_content())
        return None

    def get_approval_date(self):
        """Return the approval date, falling back to the establishment date."""
        return (self.get_optimized_value("核准日期") or
                self.get_optimized_value("成立日期"))

    def parse_company_info(self):
        """Parse every supported field of the company page.

        Returns:
            A dict mapping field names to their values (also stored in
            ``self.company_data``), or ``None`` when the table is not found.
        """
        if not self.init_table():
            return None
        self.company_data = {
            "企业名称": (self.get_optimized_value("企业名称") or
                     self.get_optimized_value("公司名称")),
            "统一社会信用代码": self.get_unified_social_credit_code(),
            "法定代表人": self.get_legal_representative(),
            "经营状态": self.get_optimized_value("登记状态"),
            "成立日期": self.get_optimized_value("成立日期"),
            "行政区划": self.get_optimized_value("行政区划"),
            "注册资本": self.get_optimized_value("注册资本"),
            "实缴资本": self.get_optimized_value("实缴资本"),
            "企业类型": self.get_optimized_value("企业类型"),
            "所属行业": self.get_optimized_value("国标行业"),
            "工商注册号": self.get_business_registration_no(),
            "组织机构代码": self.get_organization_code(),
            "纳税人识别号": self.get_taxpayer_id(),
            "纳税人资质": self.get_optimized_value("纳税人资质"),
            "营业期限": self.get_optimized_value("营业期限"),
            "核准日期": self.get_approval_date(),
            "参保人数": self.get_insurance_number(),
            "电话": self.get_phone_number(),
            "登记机关": self.get_optimized_value("登记机关"),
            "曾用名": self.get_optimized_value("曾用名"),
            "注册地址": self.get_optimized_value("注册地址"),
            "经营范围": self.get_optimized_value("经营范围"),
        }
        return self.company_data
def load_cookies(context, cookie_file):
    """Load previously saved cookies into a browser context.

    Args:
        context: Object exposing ``add_cookies(cookies)``.
        cookie_file: Path of the JSON cookie file written by an earlier run.

    Returns:
        True when cookies were read and passed to ``context``; False when
        the file does not exist or does not contain valid JSON.
    """
    # Open directly instead of checking os.path.exists() first: one step,
    # no window in which the file can disappear between check and open.
    try:
        with open(cookie_file, 'r', encoding='utf-8') as f:
            cookies = json.load(f)
    except FileNotFoundError:
        return False
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A damaged cookie file must not abort the run; report it and let
        # the caller continue as if no cookies had been saved.
        print(f"cookies文件无法解析,已忽略: {e}")
        return False
    context.add_cookies(cookies)
    print("已加载本地cookies")
    return True
def save_cookies(context, cookie_file):
    """Write the cookies currently held by ``context`` to a JSON file.

    Args:
        context: Object exposing ``cookies()``.
        cookie_file: Destination path; an existing file is overwritten.
    """
    # Read the cookies before opening the file for writing.
    snapshot = context.cookies()
    with open(cookie_file, 'w') as out:
        json.dump(snapshot, out)
    print("已保存cookies到文件")
def wait_for_login(page, cookie_file):
    """Wait for the user to finish the QR-code login, then save the cookies.

    Args:
        page: Page currently showing (or about to show) the login page.
        cookie_file: Path the post-login cookies are written to.

    Raises:
        Whatever ``page.wait_for_url`` raises on timeout: the login page must
        be reached within 3 seconds and left within 120 seconds.
    """
    print("检测到需要登录,请使用手机扫码登录...")
    print("登录成功后将自动跳转到目标页面")
    # Recognise the login page by substring, the same way as the second wait
    # below.  A "**/weblogin" glob only matches a URL that ends with
    # "/weblogin", so a login URL carrying a query string would time out here.
    page.wait_for_url(lambda url: "weblogin" in url, timeout=3000)
    # Then wait until the browser has left the login page.
    page.wait_for_url(lambda url: "weblogin" not in url, timeout=120000)
    # Persist the cookies of the logged-in session for later runs.
    save_cookies(page.context, cookie_file)
    print("登录成功已保存cookies")
def main():
    """Command-line entry point: open a QCC company page and print its data.

    Parses the command line, starts Chromium, restores saved cookies when
    available, waits for a manual login if the site redirects to the login
    page, then prints the parsed company information as JSON.  Any exception
    is reported on stdout and the browser is always closed.
    """
    cli = argparse.ArgumentParser(description='解析企查查公司信息')
    cli.add_argument('url', help='企查查公司页面URL')
    cli.add_argument('--headless', action='store_true', help='无头模式运行')
    cli.add_argument('--cookie-file', default='qcc_cookies.txt', help='cookies文件路径')
    opts = cli.parse_args()

    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=opts.headless)
        context = browser.new_context()
        page = context.new_page()
        try:
            # Reuse the login of an earlier run when a cookie file exists.
            cookies_restored = load_cookies(context, opts.cookie_file)
            if cookies_restored:
                print("使用已保存的登录信息")

            page.goto(opts.url)

            # A redirect to the login page means a manual login is needed.
            if "weblogin" not in page.url:
                print("已登录或无需登录")
            else:
                wait_for_login(page, opts.cookie_file)

            # Load the target URL again so the company page is the one parsed.
            page.goto(opts.url)

            company_info = QCCParser(page).parse_company_info()
            if not company_info:
                print("未能获取公司信息")
            else:
                print(json.dumps(company_info, ensure_ascii=False, indent=2))
        except Exception as exc:
            print(f"发生错误: {exc}")
        finally:
            browser.close()
# Run main() only when this file is executed as a script.
if __name__ == "__main__":
    main()
# Usage examples:
# python qcc.py "https://www.qcc.com/firm/50b0e3189f2eb2b20304b255669ce1a1.html"
# # The first run requires logging in by scanning the QR code
# python qcc.py "https://www.qcc.com/firm/<company page>"
#
# # Later runs automatically reuse the saved login information
# python qcc.py "https://www.qcc.com/firm/<company page>"
#
# # Use a custom cookies file
# python qcc.py --cookie-file my_cookies.txt "https://www.qcc.com/firm/<company page>"

831
company/youhou.js Normal file
View File

@@ -0,0 +1,831 @@
// ==UserScript==
// @name 爱企查&企查查等
// @namespace http://tampermonkey.net/
// @version 0.2
// @description 在页面右下角添加工具按钮,支持复制源码和解析公司信息
// @author You
// @match https://www.qcc.com/firm/*
// @match https://aiqicha.baidu.com/company_detail_*
// @grant none
// ==/UserScript==
(function () {
"use strict";
/**
 * Shared helpers for the userscript: text cleanup, clipboard access and
 * on-page message / result dialogs.
 */
class ToolUtils {
  /**
   * Collapse every run of whitespace into one space and trim both ends.
   * @param {string|null|undefined} text Raw text; null/undefined counts as "".
   * @returns {string} The cleaned text.
   */
  static cleanText(text) {
    // \s already covers \r, \n and \t, so a single pass is enough.
    return String(text ?? "")
      .replace(/\s+/g, " ")
      .trim();
  }

  /**
   * Return the trimmed text of the first selector that matches an element
   * with non-blank text.
   * @param {ParentNode} doc Node to search in.
   * @param {string[]} selectors CSS selectors, tried in order.
   * @returns {string} The text, or "" when nothing matched.
   */
  static extractText(doc, selectors) {
    for (const selector of selectors) {
      const text = doc.querySelector(selector)?.textContent.trim();
      if (text) {
        return text;
      }
    }
    return "";
  }

  /**
   * Copy text to the clipboard through a temporary, invisible textarea.
   * @param {string} content Text to copy.
   * @param {string} [successMessage] Message shown after a successful copy.
   * @returns {boolean} Whether the browser reported the copy as successful.
   */
  static copyToClipboard(content, successMessage) {
    const textarea = document.createElement("textarea");
    textarea.value = content;
    Object.assign(textarea.style, {
      position: "fixed",
      top: "0",
      left: "0",
      width: "1px",
      height: "1px",
      opacity: "0",
    });
    document.body.appendChild(textarea);
    let copied = false;
    try {
      textarea.select();
      // execCommand returns false when the copy was refused.
      copied = document.execCommand("copy");
    } catch (err) {
      copied = false;
    } finally {
      // Always remove the helper element, even if the copy threw.
      document.body.removeChild(textarea);
    }
    if (!copied) {
      ToolUtils.showAutoCloseMessage("复制失败", "error");
    } else if (successMessage) {
      ToolUtils.showAutoCloseMessage(successMessage, "success");
    }
    return copied;
  }

  /**
   * Show a centered message that fades out and removes itself (used instead
   * of alert()).
   * @param {string} message Text to display.
   * @param {"info"|"success"|"error"} [type="info"] Selects the background colour.
   */
  static showAutoCloseMessage(message, type = "info") {
    const colors = { success: "#52c41a", error: "#f5222d" };
    const alertBox = document.createElement("div");
    alertBox.textContent = message;
    Object.assign(alertBox.style, {
      position: "fixed",
      top: "50%",
      left: "50%",
      transform: "translate(-50%, -50%)",
      // Any type other than success/error uses the blue "info" colour.
      backgroundColor: colors[type] || "#1890ff",
      color: "white",
      padding: "10px 20px",
      borderRadius: "4px",
      zIndex: "10001",
      boxShadow: "0 2px 8px rgba(0,0,0,0.15)",
      transition: "opacity 0.3s",
    });
    document.body.appendChild(alertBox);

    // After 2 s start the fade-out, then remove the element once the
    // 0.3 s opacity transition has finished.
    setTimeout(() => {
      if (!document.body.contains(alertBox)) return;
      alertBox.style.opacity = "0";
      setTimeout(() => {
        if (document.body.contains(alertBox)) {
          document.body.removeChild(alertBox);
        }
      }, 300);
    }, 2000);
  }

  /**
   * Show the parsed data as pretty-printed JSON in a modal with "copy" and
   * "close" buttons.
   * @param {object} data Data to display.
   */
  static showResult(data) {
    const json = JSON.stringify(data, null, 2);

    const modal = document.createElement("div");
    Object.assign(modal.style, {
      position: "fixed",
      top: "50%",
      left: "50%",
      transform: "translate(-50%, -50%)",
      width: "600px",
      maxHeight: "80vh",
      overflowY: "auto",
      backgroundColor: "white",
      padding: "20px",
      boxShadow: "0 0 10px rgba(0,0,0,0.3)",
      zIndex: "10000",
    });

    const heading = document.createElement("h2");
    heading.textContent = "企业信息解析结果";
    heading.style.marginTop = "0";

    const pre = document.createElement("pre");
    pre.textContent = json;
    pre.style.whiteSpace = "pre-wrap";
    pre.style.wordWrap = "break-word";

    // Both buttons share everything except label, colour and left margin.
    const makeButton = (label, color) => {
      const button = document.createElement("button");
      button.textContent = label;
      Object.assign(button.style, {
        marginTop: "10px",
        padding: "8px 16px",
        backgroundColor: color,
        color: "white",
        border: "none",
        borderRadius: "4px",
        cursor: "pointer",
      });
      return button;
    };

    const copyBtn = makeButton("复制JSON", "#52c41a");
    copyBtn.addEventListener("click", () => {
      const fallback = () =>
        ToolUtils.copyToClipboard(json, "已复制到剪贴板");
      // navigator.clipboard can be undefined (e.g. on insecure pages);
      // use the textarea-based copy then, and also when writeText rejects.
      if (navigator.clipboard && navigator.clipboard.writeText) {
        navigator.clipboard
          .writeText(json)
          .then(() =>
            ToolUtils.showAutoCloseMessage("已复制到剪贴板", "success")
          )
          .catch(fallback);
      } else {
        fallback();
      }
    });

    const closeBtn = makeButton("关闭", "#f5222d");
    closeBtn.style.marginLeft = "10px";
    closeBtn.addEventListener("click", () => {
      document.body.removeChild(modal);
    });

    modal.appendChild(heading);
    modal.appendChild(pre);
    modal.appendChild(document.createElement("br"));
    modal.appendChild(copyBtn);
    modal.appendChild(closeBtn);
    document.body.appendChild(modal);
  }
}
/**
 * Parser for Aiqicha (aiqicha.baidu.com) company detail pages.  Values are
 * read from `table.zx-detail-basic-table`, where a title cell is followed by
 * its value cell.
 */
class AiQiChaParser {
  constructor() {
    // Set by initTable().
    this.table = null;
  }

  // All <td> cells of the info table, as an array.
  _cells() {
    return Array.from(this.table.querySelectorAll("td"));
  }

  // Cleaned text of a value cell, preferring its ".enter-bg-ele" child.
  _enterBgText(valueCell) {
    const rawValue =
      valueCell.querySelector(".enter-bg-ele")?.textContent ||
      valueCell.textContent;
    return ToolUtils.cleanText(rawValue);
  }

  // Value text of the cell following `cell`; null when either is missing.
  _valueAfter(cell) {
    const valueCell = cell?.nextElementSibling;
    return valueCell ? this._enterBgText(valueCell) : null;
  }

  /**
   * Read the phone number from the business-info area.
   * @returns {string} The number, or a "not found" message string.
   */
  getPhoneNumber() {
    const phoneContainer = document.querySelector(
      "div.business-info div.telphone-lists-wrap"
    );
    if (!phoneContainer) return "未找到电话信息";
    const phoneElement = phoneContainer.querySelector("span.copy-box span");
    if (!phoneElement) return "未找到电话号码";
    return ToolUtils.cleanText(phoneElement.textContent);
  }

  /**
   * Locate the info table; alerts the user when it is missing.
   * @returns {boolean} Whether the table was found.
   */
  initTable() {
    this.table = document.querySelector("table.zx-detail-basic-table");
    if (!this.table) {
      alert("未找到企业信息表格");
      return false;
    }
    return true;
  }

  /**
   * Value next to the cell whose cleaned text equals `title` exactly.
   * @param {string} title Exact title text.
   * @returns {string|null}
   */
  getOptimizedValue(title) {
    const titleCell = this._cells().find(
      (cell) => ToolUtils.cleanText(cell.textContent) === title
    );
    const valueCell = titleCell?.nextElementSibling;
    if (!valueCell) return null;
    const valueElement =
      valueCell.querySelector(".enter-bg-ele") ||
      valueCell.querySelector(".addr-enter-bg-ele") ||
      valueCell;
    return ToolUtils.cleanText(valueElement.textContent);
  }

  /** @returns {string|null} Legal representative's name. */
  getLegalRepresentative() {
    const cells = this._cells();

    // Preferred: exact title cell followed by an "image-text-content" cell.
    const exactCell = cells.find(
      (td) => ToolUtils.cleanText(td.textContent) === "法定代表人"
    );
    const exactValue = exactCell?.nextElementSibling;
    if (exactValue && exactValue.classList.contains("image-text-content")) {
      const nameElement = exactValue.querySelector(".person-name-warp a");
      return ToolUtils.cleanText((nameElement || exactValue).textContent);
    }

    // Fallback: first cell merely containing the title.
    const looseCell = cells.find((td) =>
      td.textContent.includes("法定代表人")
    );
    const looseValue = looseCell?.nextElementSibling;
    return looseValue ? ToolUtils.cleanText(looseValue.textContent) : null;
  }

  /** @returns {string|null} Unified social credit code (or taxpayer ID as fallback). */
  getUnifiedSocialCreditCode() {
    const cells = this._cells();
    const codeCell = cells.find(
      (td) =>
        td.textContent.includes("统一社会信用代码") &&
        td.nextElementSibling &&
        td.nextElementSibling.classList.contains("table-regCapital-lable")
    );
    if (codeCell) {
      return this._enterBgText(codeCell.nextElementSibling);
    }
    return this._valueAfter(
      cells.find((td) => td.textContent.includes("纳税人识别号"))
    );
  }

  /** @returns {string|null} Business registration number. */
  getBusinessRegistrationNo() {
    return this._valueAfter(
      this._cells().find((td) =>
        ToolUtils.cleanText(td.textContent).includes("工商注册号")
      )
    );
  }

  /** @returns {string|null} Organization code. */
  getOrganizationCode() {
    const tip = Array.from(
      this.table.querySelectorAll(".poptip-wrap-org-no")
    ).find((el) => el.textContent.includes("组织机构代码"));
    // closest() returns null when the tooltip is not inside a <td>.
    const tipValue = tip?.closest("td")?.nextElementSibling;
    if (tipValue && tipValue.classList.contains("enter-bg")) {
      return this._enterBgText(tipValue);
    }
    return this._valueAfter(
      this._cells().find(
        (td) => ToolUtils.cleanText(td.textContent) === "组织机构代码"
      )
    );
  }

  /** @returns {string|null} Taxpayer ID (or unified social credit code as fallback). */
  getTaxpayerId() {
    const cells = this._cells();
    const taxCell = cells.find((td) =>
      ToolUtils.cleanText(td.textContent).includes("纳税人识别号")
    );
    if (taxCell?.nextElementSibling) {
      return this._enterBgText(taxCell.nextElementSibling);
    }
    return this._valueAfter(
      cells.find((td) =>
        ToolUtils.cleanText(td.textContent).includes("统一社会信用代码")
      )
    );
  }

  /** @returns {string|null} Insured-employee count such as "12人". */
  getInsuranceNumber() {
    // First "<digits>人" occurrence in the cell's text, or null.
    const headCount = (cell) => {
      const rawText = cell.textContent.replace(/[\r\n\t]/g, "").trim();
      const match = rawText.match(/(\d+人)/);
      return match ? match[0] : null;
    };
    const cells = this._cells();

    const insuranceCell = cells.find(
      (td) =>
        td.textContent.includes("参保人数") &&
        td.querySelector(".insurance-info")
    );
    if (insuranceCell) {
      const valueCell = insuranceCell.nextElementSibling;
      return valueCell ? headCount(valueCell) : null;
    }

    // Fallback: the cell just before the "登记机关" title cell.
    const registrationCell = cells.find((td) =>
      td.textContent.includes("登记机关")
    );
    const previousCell = registrationCell?.previousElementSibling;
    return previousCell ? headCount(previousCell) : null;
  }

  /** @returns {string|null} Approval date as YYYY-MM-DD. */
  getApprovalDate() {
    const isoDate = /^\d{4}-\d{2}-\d{2}$/;

    const tip = Array.from(
      this.table.querySelectorAll(".poptip-wrap-annual-date")
    ).find((el) => el.textContent.includes("核准日期"));
    // closest() returns null when the tooltip is not inside a <td>.
    const tipValue = tip?.closest("td")?.nextElementSibling;
    if (tipValue) {
      const rawValue = tipValue.textContent.replace(/[\r\n\t]/g, "").trim();
      if (isoDate.test(rawValue)) {
        return rawValue;
      }
    }

    const titleCell = this._cells().find(
      (td) => ToolUtils.cleanText(td.textContent) === "核准日期"
    );
    const titleValue = titleCell?.nextElementSibling;
    if (titleValue) {
      const rawValue = ToolUtils.cleanText(titleValue.textContent);
      if (isoDate.test(rawValue)) {
        return rawValue;
      }
    }
    return null;
  }

  /** Parse every field and show the result dialog. */
  parseCompanyInfo() {
    if (!this.initTable()) return;
    const companyData = {
      企业名称: this.getOptimizedValue("企业名称"),
      统一社会信用代码: this.getUnifiedSocialCreditCode(),
      法定代表人: this.getLegalRepresentative(),
      电话: this.getPhoneNumber(),
      经营状态: this.getOptimizedValue("经营状态"),
      成立日期: this.getOptimizedValue("成立日期"),
      行政区划: this.getOptimizedValue("行政区划"),
      注册资本: this.getOptimizedValue("注册资本"),
      实缴资本: this.getOptimizedValue("实缴资本"),
      企业类型: this.getOptimizedValue("企业类型"),
      所属行业: this.getOptimizedValue("所属行业"),
      工商注册号: this.getBusinessRegistrationNo(),
      组织机构代码: this.getOrganizationCode(),
      纳税人识别号: this.getTaxpayerId(),
      纳税人资质: this.getOptimizedValue("纳税人资质"),
      营业期限: this.getOptimizedValue("营业期限"),
      核准日期: this.getApprovalDate(),
      参保人数: this.getInsuranceNumber(),
      登记机关: this.getOptimizedValue("登记机关"),
      曾用名: this.getOptimizedValue("曾用名"),
      注册地址: this.getOptimizedValue("注册地址"),
      经营范围: this.getOptimizedValue("经营范围"),
    };
    ToolUtils.showResult(companyData);
  }
}
/**
 * Parser for QCC (qcc.com) company pages.  Values are read from
 * `table.ntable` inside `div.cominfo-normal`, where each header cell
 * (`td.tb`) is followed by its value cell.
 */
class QCCParser {
  constructor() {
    // Set by initTable().
    this.table = null;
  }

  // Text with a trailing "关联企业 <number>" badge removed.
  _stripRelated(text) {
    return text.replace(/\s*关联企业\s*\d+$/, "").trim();
  }

  // Cell following the FIRST header whose text contains `title`, or null.
  _findValueCell(title) {
    const header = Array.from(this.table.querySelectorAll("td.tb")).find(
      (cell) => ToolUtils.cleanText(cell.textContent).includes(title)
    );
    return header ? header.nextElementSibling : null;
  }

  /**
   * Locate the info table; alerts the user when it is missing.
   * @returns {boolean} Whether the table was found.
   */
  initTable() {
    const cominfoNormal = document.querySelector("div.cominfo-normal");
    if (!cominfoNormal) {
      alert("未找到企业信息容器");
      return false;
    }
    this.table = cominfoNormal.querySelector("table.ntable");
    if (!this.table) {
      alert("未找到企业信息表格");
      return false;
    }
    return true;
  }

  /**
   * Cleaned value next to the first header containing `title`; the text of
   * a ".copy-value" child is preferred over the whole cell.
   * @param {string} title Text the header must contain.
   * @returns {string|null}
   */
  getOptimizedValue(title) {
    // First match wins, the same rule the other lookups in this class use
    // (they all go through find()).
    const valueCell = this._findValueCell(title);
    if (!valueCell) return null;
    const copyValue = valueCell.querySelector(".copy-value");
    return ToolUtils.cleanText((copyValue || valueCell).textContent);
  }

  /** @returns {string|null} Legal representative's name. */
  getLegalRepresentative() {
    // Try the plain table value first.
    const basicValue =
      this.getOptimizedValue("法定代表人") || this.getOptimizedValue("法人");
    if (basicValue && basicValue.trim()) {
      return this._stripRelated(basicValue);
    }

    const valueCell = this._findValueCell("法定代表人");
    if (!valueCell) return null;

    // Links opening in a new tab are tried first as name candidates.
    for (const link of valueCell.querySelectorAll('a[target="_blank"]')) {
      const name = ToolUtils.cleanText(link.textContent);
      // Skip empty links and links that are obviously not a name.
      if (name && !name.includes("关联企业") && !name.includes("复制")) {
        return name;
      }
    }

    // Fallbacks, in order: first link, ".copy-value" child, raw cell text.
    const fallback =
      valueCell.querySelector("a") ||
      valueCell.querySelector(".copy-value") ||
      valueCell;
    return this._stripRelated(ToolUtils.cleanText(fallback.textContent));
  }

  /** @returns {string|null} Unified social credit code. */
  getUnifiedSocialCreditCode() {
    return (
      this.getOptimizedValue("统一社会信用代码") ||
      this.getOptimizedValue("信用代码")
    );
  }

  /** @returns {string|null} Business registration number. */
  getBusinessRegistrationNo() {
    return (
      this.getOptimizedValue("工商注册号") || this.getOptimizedValue("注册号")
    );
  }

  /** @returns {string|null} Organization code. */
  getOrganizationCode() {
    return this.getOptimizedValue("组织机构代码");
  }

  /** @returns {string|null} Taxpayer ID, else the unified social credit code. */
  getTaxpayerId() {
    return (
      this.getOptimizedValue("纳税人识别号") ||
      this.getUnifiedSocialCreditCode()
    );
  }

  /** @returns {string|null} Insured-employee count followed by the report-year text. */
  getInsuranceNumber() {
    const valueCell = this._findValueCell("参保人数");
    if (!valueCell) return null;
    const numberSpan = valueCell.querySelector("span");
    const number = numberSpan
      ? ToolUtils.cleanText(numberSpan.textContent)
      : null;
    const reportLink = valueCell.querySelector("a.m-l-r-10");
    const reportYear = reportLink
      ? ToolUtils.cleanText(reportLink.textContent)
      : "";
    return number ? `${number}${reportYear}` : null;
  }

  /** @returns {string|null} Phone number from the contact-info area. */
  getPhoneNumber() {
    const contactInfo = document.querySelector("div.contact-info");
    if (!contactInfo) return null;
    const rightPart = contactInfo.querySelector("div.main-part-item.right");
    if (!rightPart) return null;

    // Row labelled "电话:".
    const phoneRow = Array.from(rightPart.querySelectorAll("div.rline")).find(
      (row) => ToolUtils.cleanText(row.textContent).includes("电话:")
    );
    if (!phoneRow) return null;

    // The number is the first copyable span that is not the label itself.
    const phoneSpan = Array.from(
      phoneRow.querySelectorAll("span.need-copy-field")
    ).find((span) => !ToolUtils.cleanText(span.textContent).includes("电话:"));
    return phoneSpan ? ToolUtils.cleanText(phoneSpan.textContent) : null;
  }

  /** @returns {string|null} Approval date, else the establishment date. */
  getApprovalDate() {
    return (
      this.getOptimizedValue("核准日期") || this.getOptimizedValue("成立日期")
    );
  }

  /** Parse every field and show the result dialog. */
  parseCompanyInfo() {
    if (!this.initTable()) return;
    const companyData = {
      企业名称:
        this.getOptimizedValue("企业名称") ||
        this.getOptimizedValue("公司名称"),
      统一社会信用代码: this.getUnifiedSocialCreditCode(),
      法定代表人: this.getLegalRepresentative(),
      经营状态: this.getOptimizedValue("登记状态"),
      成立日期: this.getOptimizedValue("成立日期"),
      行政区划: this.getOptimizedValue("行政区划"),
      注册资本: this.getOptimizedValue("注册资本"),
      实缴资本: this.getOptimizedValue("实缴资本"),
      企业类型: this.getOptimizedValue("企业类型"),
      所属行业: this.getOptimizedValue("国标行业"),
      工商注册号: this.getBusinessRegistrationNo(),
      组织机构代码: this.getOrganizationCode(),
      纳税人识别号: this.getTaxpayerId(),
      纳税人资质: this.getOptimizedValue("纳税人资质"),
      营业期限: this.getOptimizedValue("营业期限"),
      核准日期: this.getApprovalDate(),
      参保人数: this.getInsuranceNumber(),
      电话: this.getPhoneNumber(),
      登记机关: this.getOptimizedValue("登记机关"),
      曾用名: this.getOptimizedValue("曾用名"),
      注册地址: this.getOptimizedValue("注册地址"),
      经营范围: this.getOptimizedValue("经营范围"),
    };
    ToolUtils.showResult(companyData);
  }
}
/**
 * Build the floating tool widget and append it to the page: a round "+"
 * handle in the bottom-right corner that expands on hover to show the
 * "copy source" and "parse company info" buttons, and can be dragged.
 */
function createButtonContainer() {
  const container = document.createElement("div");
  container.id = "tool-container";
  Object.assign(container.style, {
    position: "fixed",
    right: "20px",
    bottom: "20px",
    zIndex: "9999",
    display: "flex",
    flexDirection: "column",
    gap: "10px",
    width: "40px",
    height: "40px",
    backgroundColor: "#4CAF50",
    borderRadius: "50%",
    transition: "all 0.3s ease",
    overflow: "hidden",
    cursor: "move",
  });

  // "+" indicator shown while the widget is collapsed.
  const plusSign = document.createElement("div");
  plusSign.textContent = "+";
  Object.assign(plusSign.style, {
    color: "white",
    fontSize: "24px",
    textAlign: "center",
    lineHeight: "40px",
    width: "100%",
  });
  container.appendChild(plusSign);

  // Expand on hover, collapse again when the pointer leaves.
  container.addEventListener("mouseenter", () => {
    container.style.width = "150px";
    container.style.height = "auto";
    container.style.borderRadius = "8px";
  });
  container.addEventListener("mouseleave", () => {
    container.style.width = "40px";
    container.style.height = "40px";
    container.style.borderRadius = "50%";
  });

  // Drag support.
  let isDragging = false;
  let offsetX, offsetY;

  container.addEventListener("mousedown", (e) => {
    // Only the "+" sign or the container background starts a drag, so the
    // buttons stay clickable.
    if (e.target === plusSign || e.target === container) {
      isDragging = true;
      const rect = container.getBoundingClientRect();
      offsetX = e.clientX - rect.left;
      offsetY = e.clientY - rect.top;
      container.style.cursor = "grabbing";
      e.stopPropagation();
      e.preventDefault();
    }
  });

  // Follow the pointer while dragging; switch from right/bottom anchoring
  // to left/top so the computed position takes effect.
  document.addEventListener("mousemove", (e) => {
    if (!isDragging) return;
    container.style.left = e.clientX - offsetX + "px";
    container.style.top = e.clientY - offsetY + "px";
    container.style.right = "auto";
    container.style.bottom = "auto";
  });

  document.addEventListener("mouseup", () => {
    if (isDragging) {
      isDragging = false;
      container.style.cursor = "move";
    }
  });

  // Create one white action button with a hover highlight.
  function createButton(text, onClick) {
    const button = document.createElement("button");
    button.textContent = text;
    Object.assign(button.style, {
      padding: "8px 12px",
      border: "none",
      borderRadius: "4px",
      backgroundColor: "white",
      color: "#333",
      cursor: "pointer",
      width: "100%",
      // A transition value names the CSS property ("background-color"),
      // not the camelCase style key.
      transition: "background-color 0.2s",
    });
    button.addEventListener(
      "mouseenter",
      () => (button.style.backgroundColor = "#f0f0f0")
    );
    button.addEventListener(
      "mouseleave",
      () => (button.style.backgroundColor = "white")
    );
    button.addEventListener("click", onClick);
    return button;
  }

  // "Copy source" button: copies the whole page's HTML.
  const copySourceButton = createButton("复制源码", () => {
    const html = document.documentElement.outerHTML;
    // copyToClipboard is a static method of ToolUtils; calling it as a bare
    // function threw a ReferenceError.
    ToolUtils.copyToClipboard(html, "HTML源码已复制到剪贴板");
  });

  // "Parse company info" button: pick the parser matching the current site.
  const parseInfoButton = createButton("解析公司信息", () => {
    let parser;
    if (window.location.host.includes("aiqicha.baidu.com")) {
      parser = new AiQiChaParser();
    } else if (window.location.host.includes("qcc.com")) {
      parser = new QCCParser();
    } else {
      alert("不支持的网站");
      return;
    }
    parser.parseCompanyInfo();
  });

  container.appendChild(copySourceButton);
  container.appendChild(parseInfoButton);
  document.body.appendChild(container);
}
// Create the buttons once the page has finished loading.  If the script is
// injected after the "load" event has already fired, the listener would
// never run, so build the widget right away in that case.
if (document.readyState === "complete") {
  createButtonContainer();
} else {
  window.addEventListener("load", createButtonContainer);
}
})();

23
config.py Normal file
View File

@@ -0,0 +1,23 @@
# Request headers and proxy settings shared by the search-engine scrapers.
#
# Supplying the cookie of a logged-in session is recommended for all three
# engines. A session cookie is a credential, so it is read from the
# environment (BING_COOKIE, BAIDU_COOKIE, GOOGLE_COOKIE) instead of being
# stored in this file.
# NOTE(review): a live Baidu session cookie (BDUSS) was previously committed
# here; it remains in the repository history and should be invalidated.
import os


def _cookie_from_env(name):
    """Return the cookie string held in environment variable *name*.

    An unset variable yields an empty string. CR/LF characters are removed
    because a line break is not valid inside an HTTP header value.
    """
    return os.environ.get(name, '').replace('\r', '').replace('\n', '')


bingheaders = {
    'cookie': _cookie_from_env('BING_COOKIE'),
    'referer': 'https://cn.bing.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4051.0 Safari/537.36 Edg/82.0.425.0'}
baiduheaders = {
    'Cookie': _cookie_from_env('BAIDU_COOKIE'),
    'Host': 'www.baidu.com',
    'referer': 'https://www.baidu.com/s',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0'
}
googleheaders = {
    'cookie': _cookie_from_env('GOOGLE_COOKIE'),
    'referer': 'https://www.google.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}
# Crawling Google requires a proxy
proxy='http://127.0.0.1:7897'

69
data.csv Normal file
View File

@@ -0,0 +1,69 @@
杭州辉煌物业管理有限公司
杭州辉望科技有限公司
浙江八方电信科技集团有限公司
中国移动通信集团浙江有限公司余杭分公司西溪八方城(自营厅)
瑞凤九天(杭州)科技有限公司
金码智能科技(杭州)有限公司
OPPO广东移动通信有限公司
杭州巨量引擎网络技术有限公司
杭州绿城衡宇环境设计有限公司
新疆浙疆果业有限公司
杭州未知数品牌管理有限公司
杭州慈山科技有限公司
杭州扬拓体育科技有限公司
杭州乂游网络科技有限公司
杭州钱橙似锦科技有限公司
杭州奥得徕贸易有限公司
杭州伍壹荟旅游咨资询有限公司
杭州心满意定供应链服务有限公司
杭州麒晨科技有限公司
杭州羊咩咩文化传媒有限公司
杭州禾露则正生物科技有限公司
浙江商盟支付有限公司
天禄(杭州)科技有限公司
如是启创(杭州)科技有限公司
杭州音视贝科技有限公司
杭州千骏轴承有限公司
杭州锐擎科技有限公司
浙江力一科技有限公司
煜邦电力智能装备(嘉兴)有限公司
杭州裕阳经营管理合伙企业(有限合伙)
氧气.康复中心
杭州云迹物联科技有限公司
杭州着墨文化创意者限公司
亚信科技(南京)有限公司
杭州密尔沃智能装备有限公司
杭州骏远电子商务有限公司
杭州一喂智能科技有限公司
杭州孚伦特科技有限公司
杭州人谋天成科技有限公司
杭州瑾馨贸易集团有限公司
杭州琑为缘文化艺术有限公司
浙江丝里伯睡眠科技股份有限公司
杭州倍驰科技有限公司
杭州心灵部落教育有限公司(灵动生活)
杭州云印智造科技有限公司
浙江海拓环境技术有限公司
申能环境科技有限公司
医贝云服(杭州)科技有限公司
杭州甬盛通信技术有限公司
杭州字节跳动科技有限公司
杭州邻汇网络科技有限公司
浙江建盛安全科技有限公司
幻想集团·杭州运营中心
杭州阿克莱斯设备有限公司
浙江省现代农业促进会
益思芯科技(杭州)有限公司
杭州霖思网络科技有限公司
杭州星瀚知识产权代理有限公司
风华(杭州)信息技术有限公司
杭州晓羽科技有限公司
浙江根旺律师事务所
远大住宅工业(杭州)有限公司
浙江全应科技有限公司
杭州塞牧文化传媒有限公司
浙江彩屋信息技术有限公司
杭州瑞泡特教育科技有限公司
杭州贝享健康科技有限公司
杭州摸象大数据科技有限公司
杭州颐刻生物科技有限公司
1 杭州辉煌物业管理有限公司
2 杭州辉望科技有限公司
3 浙江八方电信科技集团有限公司
4 中国移动通信集团浙江有限公司余杭分公司西溪八方城(自营厅)
5 瑞凤九天(杭州)科技有限公司
6 金码智能科技(杭州)有限公司
7 OPPO广东移动通信有限公司
8 杭州巨量引擎网络技术有限公司
9 杭州绿城衡宇环境设计有限公司
10 新疆浙疆果业有限公司
11 杭州未知数品牌管理有限公司
12 杭州慈山科技有限公司
13 杭州扬拓体育科技有限公司
14 杭州乂游网络科技有限公司
15 杭州钱橙似锦科技有限公司
16 杭州奥得徕贸易有限公司
17 杭州伍壹荟旅游咨资询有限公司
18 杭州心满意定供应链服务有限公司
19 杭州麒晨科技有限公司
20 杭州羊咩咩文化传媒有限公司
21 杭州禾露则正生物科技有限公司
22 浙江商盟支付有限公司
23 天禄(杭州)科技有限公司
24 如是启创(杭州)科技有限公司
25 杭州音视贝科技有限公司
26 杭州千骏轴承有限公司
27 杭州锐擎科技有限公司
28 浙江力一科技有限公司
29 煜邦电力智能装备(嘉兴)有限公司
30 杭州裕阳经营管理合伙企业(有限合伙)
31 氧气.康复中心
32 杭州云迹物联科技有限公司
33 杭州着墨文化创意者限公司
34 亚信科技(南京)有限公司
35 杭州密尔沃智能装备有限公司
36 杭州骏远电子商务有限公司
37 杭州一喂智能科技有限公司
38 杭州孚伦特科技有限公司
39 杭州人谋天成科技有限公司
40 杭州瑾馨贸易集团有限公司
41 杭州琑为缘文化艺术有限公司
42 浙江丝里伯睡眠科技股份有限公司
43 杭州倍驰科技有限公司
44 杭州心灵部落教育有限公司(灵动生活)
45 杭州云印智造科技有限公司
46 浙江海拓环境技术有限公司
47 申能环境科技有限公司
48 医贝云服(杭州)科技有限公司
49 杭州甬盛通信技术有限公司
50 杭州字节跳动科技有限公司
51 杭州邻汇网络科技有限公司
52 浙江建盛安全科技有限公司
53 幻想集团·杭州运营中心
54 杭州阿克莱斯设备有限公司
55 浙江省现代农业促进会
56 益思芯科技(杭州)有限公司
57 杭州霖思网络科技有限公司
58 杭州星瀚知识产权代理有限公司
59 风华(杭州)信息技术有限公司
60 杭州晓羽科技有限公司
61 浙江根旺律师事务所
62 远大住宅工业(杭州)有限公司
63 浙江全应科技有限公司
64 杭州塞牧文化传媒有限公司
65 浙江彩屋信息技术有限公司
66 杭州瑞泡特教育科技有限公司
67 杭州贝享健康科技有限公司
68 杭州摸象大数据科技有限公司
69 杭州颐刻生物科技有限公司

1
freeze.bat Normal file
View File

@@ -0,0 +1 @@
REM Write the packages reported by "pip freeze" to requirements.txt (overwrites the file)
pip freeze > requirements.txt

2
install_requirements.bat Normal file
View File

@@ -0,0 +1,2 @@
REM Upgrade pip first so the upgraded version performs the installation.
REM Both steps go through "python.exe -m pip" so they use the same interpreter.
python.exe -m pip install --upgrade pip
python.exe -m pip install -r requirements.txt

138
main.py Normal file
View File

@@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
import asyncio
import random
import aiohttp
import time
import sys
from bs4 import BeautifulSoup
import re
import aiofiles
import urllib.parse
import argparse
from colorama import init, Fore
from search import Bing,Baidu
import openpyxl
import ssl
from tool.read_csv import CSVReader
start = time.time()
def printascii():
    """Print the ASCII-art banner in green, then reset the colour."""
    # Initialise colorama
    init()
    # Green banner text followed by a colour reset
    print(Fore.GREEN + r'''
____ _
/ ___| ___ __ _ _ __ ___| |__ ___ _ __
\___ \ / _ \/ _` | '__/ __| '_ \ / _ \ '__|
___) | __/ (_| | | | (__| | | | __/ |
|____/ \___|\__,_|_| \___|_| |_|\___|_|
''' + Fore.RESET)
# 天欣安全实验室
def writeExcel(titles, links,ws):
    """Append one ``(title, link)`` row to *ws* per pair of entries.

    Pairing stops at the shorter of *titles* and *links*. *ws* only needs
    an ``append`` method.
    """
    for pair in zip(titles, links):
        ws.append(pair)
def create_sheet_and_write(wb, engine, keywords, num, title):
    """Create a sheet named *title* in *wb* and fill it with search results.

    *engine* is called as ``engine(keywords, num)``; element 0 of its result
    is used as the titles and element 1 as the links.
    """
    sheet = wb.create_sheet(title=title)
    found = engine(keywords, num)
    found_titles, found_links = found[0], found[1]
    writeExcel(found_titles, found_links, sheet)
def excel_text2url(link_url):
    """Return an Excel ``HYPERLINK`` formula that makes *link_url* clickable.

    The URL is used as both the link target and the displayed text. Double
    quotes are doubled, which is how a quote is written inside an Excel
    string literal; otherwise such a URL would end the literal early and
    produce a broken formula.
    """
    escaped = str(link_url).replace('"', '""')
    return f'=HYPERLINK("{escaped}","{escaped}")'
# 遍历所有工作表,并将第二列的所有数据传递给 excel_text2url 函数重新赋值
def update_hyperlinks(wb):
    """Rewrite every non-empty cell in column 2 of each sheet as a hyperlink formula.

    Each value found is replaced by the result of ``excel_text2url``.
    """
    for sheet in wb.worksheets:
        second_column = sheet.iter_rows(min_row=1, max_row=sheet.max_row, min_col=2, max_col=2)
        for row in second_column:
            for cell in row:
                if not cell.value:
                    # Ends only the loop over this row's cells; later rows are still visited
                    break
                cell.value = excel_text2url(cell.value)
def commend():
    """Parse the command-line options and return the resulting namespace.

    Options: ``-k`` keywords (one or more), ``-p`` page count or ``start:end``
    range (default ``'5'``), ``-m`` search engines (default ``'all'``).
    When the script is started without any argument, the banner and the help
    text are printed and the process exits.
    """
    parser = argparse.ArgumentParser(
        prog="Searcher",
        description='此工具用于对百度、必应和谷歌搜索的协程爬取--天欣安全实验室',
        usage='please read -h',
    )
    option_specs = (
        ("-k", {"type": str, "help": "搜索的关键词", "nargs": '+'}),
        ("-p", {"type": str, "help": "需要搜索页数,默认为5,支持范围搜索例如搜索从第2页到第五页的参数为 2:5", "default": '5'}),
        ("-m", {"type": str, "help": "使用的搜索引擎:百度:bd,必应:bin,谷歌:goo 不填写默认使用全部", "default": 'all', "nargs": '+'}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    parser.exit_on_error = False
    args = parser.parse_args()
    if len(sys.argv) != 1:
        return args
    # No arguments at all: show the banner and usage, then stop
    printascii()
    parser.print_help()
    sys.exit()
def search_company_info(company_name_arg, num):
    """Search Bing for a company and keep the hits on company-registry sites.

    Args:
        company_name_arg: search text; surrounding whitespace is stripped.
        num: page count or ``start:end`` range, passed on to ``Bing.bing_main``.

    Returns:
        A list of ``[title, url]`` pairs whose URL starts with an aiqicha,
        qcc or tianyancha company-page prefix.
    """
    # Literal prefixes: unlike the earlier regular expression, the dots here
    # cannot match arbitrary characters.
    company_url_prefixes = (
        "https://aiqicha.baidu.com/company_detail_",
        "https://www.qcc.com/firm/",
        "https://www.tianyancha.com/company/",
    )
    keywords = company_name_arg.strip()
    result = Bing.bing_main(keywords, num)
    data_list = []
    # result[0] holds the titles, result[1] the matching URLs
    for title, url in zip(result[0], result[1]):
        print(f"必应搜索爬取结果为,title:{title}, url:{url}")
        if url.startswith(company_url_prefixes):
            data_list.append([title, url])
    return data_list
def filter_company_sites(urls):
    """Return the URLs that point at a company page on a supported registry site.

    Accepted prefixes, for example:
        https://www.tianyancha.com/company/5226478758
        https://aiqicha.baidu.com/company_detail_26602790857925
        https://www.qcc.com/firm/05b449eb5cc417d0f97c14104051f5c0.html

    The prefixes are compared literally, so a dot only matches a dot. Input
    order is preserved.
    """
    company_url_prefixes = (
        "https://aiqicha.baidu.com/company_detail_",
        "https://www.qcc.com/firm/",
        "https://www.tianyancha.com/company/",
    )
    return [url for url in urls if url.startswith(company_url_prefixes)]
def search_one_company(company_name_arg, num):
    """Search Bing and Baidu for one company and save the hits to an .xlsx file.

    Args:
        company_name_arg: search text; surrounding whitespace is stripped.
        num: page count or ``start:end`` range handed to both engines.

    The workbook gets one sheet per engine and is written to the current
    directory as ``<keyword>-<company name>.xlsx``.
    """
    keywords = company_name_arg.strip()
    print(f"您搜索的关键词为:{keywords}")
    wb = openpyxl.Workbook()
    # Remove the sheet that openpyxl creates by default
    wb.remove(wb['Sheet'])
    printascii()
    # Characters that are not allowed in Windows file names
    pattern = r"[\\/:\*\?\"<>|]"
    keyword = re.sub(pattern, "", keywords)
    # Both parts of the file name are sanitised; previously the raw argument
    # was appended, so a name containing e.g. '|' or '/' made the save fail.
    safe_company_name = re.sub(pattern, "", company_name_arg)
    create_sheet_and_write(wb, Bing.bing_main, keywords, num, "必应爬取结果")
    create_sheet_and_write(wb, Baidu.baidu_main, keywords, num, "百度爬取结果")
    # Turn every URL into a clickable hyperlink formula
    update_hyperlinks(wb)
    wb.save(f'./{keyword}-{safe_company_name}.xlsx')
    print(Fore.GREEN + '总任务结束!' + Fore.RESET)
    end = time.time()
    # Reset afterwards so later terminal output is not left red
    print(Fore.RED + f'脚本总时间: {end - start:.2f}' + Fore.RESET)
# Script entry point: read company names from data.csv and search for them.
if __name__ == '__main__':
    reader = CSVReader('data.csv')
    # First column of a header-less CSV
    company_names = reader.read_column(0, has_header=False)
    print("所有数据:", company_names)
    # Iteration counter, used below to stop the loop
    i= 1
    for company_name in company_names:
        # Wait 5 seconds plus a random 1-5 seconds before each search
        sleep_time = 5
        sleep_time += random.randint(1, 5)
        time.sleep(sleep_time)
        # Add the registry site names to the search text
        company_name += " 爱企查|企查查"
        data_list = search_company_info(company_name, '1')
        print(data_list)
        i=i+1
        # NOTE(review): i is already 2 after the first pass, so only the first
        # company is ever processed — looks like a debugging limit; confirm.
        if i > 1:
            break

20
requirements.txt Normal file
View File

@@ -0,0 +1,20 @@
aiofiles==24.1.0
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
async-timeout==4.0.3
attrs==24.2.0
beautifulsoup4==4.12.3
colorama==0.4.6
et-xmlfile==1.1.0
frozenlist==1.4.1
greenlet==3.2.4
idna==3.8
lxml==5.3.0
multidict==6.1.0
openpyxl==3.1.5
playwright==1.55.0
pyee==13.0.0
soupsieve==2.6
typing_extensions==4.12.2
yarl==1.11.1

128
search/Baidu.py Normal file
View File

@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
import os
import asyncio
import aiohttp
import time
import sys
from bs4 import BeautifulSoup
import re
import aiofiles
import urllib.parse
import argparse
from colorama import init, Fore
import ssl
from urllib.parse import quote
# 添加项目根目录到 sys.path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import config
baiduheaders=config.baiduheaders
# Timeout settings passed to session.get() in getfirstinfo
timeout = aiohttp.ClientTimeout(
    total=None, # total timeout (None: not set)
    sock_connect=5.5, # connection timeout: 5.5 seconds
    sock_read=5.5 # read timeout: 5.5 seconds
)
#--天欣安全实验室--#
# 初次请求获取百度加密后的url
async def getfirstinfo(keyword, pn,session):
    """Fetch one Baidu result page and collect its titles and redirect links.

    Args:
        keyword: search text, inserted into the query string as given.
        pn: value of Baidu's ``pn`` query parameter for this page.
        session: aiohttp client session used for the request.

    Returns:
        ``(titles, links)`` — two parallel lists. The links are Baidu
        redirect URLs that still need resolving. ``([], [])`` is returned
        after three failed attempts.
    """
    # NOTE(review): certificate verification is switched off for this request
    sslcontext = ssl.create_default_context()
    sslcontext.check_hostname = False
    sslcontext.verify_mode = ssl.CERT_NONE
    url = f'https://www.baidu.com/s?wd={keyword}&pn={pn}'
    for _attempt in range(3):
        # Fresh lists per attempt, so a failure part-way through a page
        # cannot leave partial or duplicated entries in the result.
        titlelist = []
        fakeurl = []
        try:
            async with session.get(url, headers=baiduheaders, ssl=sslcontext,timeout=timeout) as resp:
                html = await resp.text()
                soup = BeautifulSoup(html, 'lxml')
                for heading in soup.select('h3.t'):
                    anchor = heading.a
                    href = anchor.get('href') if anchor is not None else None
                    if not href:
                        # A heading without a link cannot be resolved; skip it
                        continue
                    h3text = heading.text.replace('\n', '').replace(',', ' ').replace('\ue636', '').strip()
                    titlelist.append(h3text)
                    fakeurl.append(href)
                return titlelist, fakeurl
        except Exception:
            print("baidu链接失败正在重新尝试...")
    print(f"百度任务出错:{url}该url无法正常获取数据。")
    return [],[]
# 再次请求获取真实的网站url
async def gettrueurl(url,printtitle,session):
    """Resolve a Baidu redirect link to the URL it points at.

    The link is requested without following redirects and the ``Location``
    response header is taken as the real URL.

    Args:
        url: Baidu redirect link.
        printtitle: result title, only used in the printed output.
        session: aiohttp client session used for the request.

    Returns:
        The ``Location`` value when present; otherwise *url* joined onto the
        Baidu domain. If the request fails, *url* is returned unchanged.
    """
    try:
        domain = 'https://www.baidu.com/'
        async with session.get(url, headers=baiduheaders, allow_redirects=False) as resp:
            await resp.text()
            # Test the header value itself: str(None) is the non-empty text
            # 'None', which used to be returned as if it were a URL.
            location = resp.headers.get('Location')
            if location:
                print(printtitle," ",location)
                return location
            print(url + '该url无法转跳')
            url = urllib.parse.urljoin(domain, url)
            print(printtitle, " ",url)
            return url
    except Exception:
        # Best effort: keep the unresolved link rather than lose the result
        return url
async def baidu_spinder(keyword, num):
    """Crawl Baidu result pages for *keyword* and resolve the result links.

    Args:
        keyword: search text, inserted into the request URL as given.
        num: a page count (``'5'`` means pages 1-5) or an inclusive range
            ``'start:end'`` (``'2:5'``). An int is accepted as well.

    Returns:
        ``(titles, urls)`` — the collected titles and the resolved URLs.

    Raises:
        ValueError: if *num* has more than one ``':'`` or a side of the
            range is not a number.
    """
    print(f'百度爬取任务进行中,爬取页数为{num}...')
    num = str(num)
    if ':' in num:
        if num.count(':') > 1:
            raise ValueError("输入中必须且只能包含一个 ':'")
        start_page, end_page = num.split(':')
        if not (start_page.isdigit() and end_page.isdigit()):
            raise ValueError("':' 两侧的值必须是数字")
        # Page numbers are turned into result offsets of 10 per page
        start_page = (int(start_page) - 1) * 10
        end_page = int(end_page) * 10
    else:
        start_page, end_page = 0, int(num) * 10

    # Step 1: fetch all result pages concurrently
    async with aiohttp.ClientSession() as session:
        page_tasks = [
            asyncio.create_task(getfirstinfo(keyword, pn, session))
            for pn in range(start_page, end_page, 10)
        ]
        pages = await asyncio.gather(*page_tasks)

    # Step 2: resolve every redirect link concurrently
    titlelist = []
    resolve_tasks = []
    async with aiohttp.ClientSession() as session:
        for page_titles, page_links in pages:
            titlelist += page_titles
            for printtitle, url in zip(page_titles, page_links):
                if not url.startswith(('http://', 'https://')):
                    # Relative link: make it absolute against the Baidu domain
                    url = urllib.parse.urljoin('http://www.baidu.com/', url)
                resolve_tasks.append(asyncio.create_task(gettrueurl(url, printtitle, session)))
        print('标题\t URL\t')
        urllist = list(await asyncio.gather(*resolve_tasks))
    count = len(urllist)
    print(f"百度搜索爬取结果数量为{count}")
    print(Fore.GREEN + '百度爬取任务完成!\n' + Fore.RESET)
    return titlelist, urllist
# await baiduwriteCSV(titlelist, urllist, keyword)
def baidu_main(keyword, num):
    """Synchronous entry point: run the Baidu crawl and return its result.

    *keyword* is percent-encoded before it is handed to ``baidu_spinder``.
    Returns the ``(titles, urls)`` pair produced by the crawl.
    """
    keyword = quote(keyword)
    if sys.platform.startswith('win'):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    # A dedicated loop is created explicitly: asyncio.get_event_loop() is
    # deprecated when no loop is current and raises on newer Python versions.
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(baidu_spinder(keyword, num))
    finally:
        loop.close()
async def Baidu_main(keywords, num):
    """Coroutine entry point: await ``baidu_spinder`` and return its result.

    NOTE(review): unlike ``baidu_main``, the keywords are passed on without
    ``quote()`` — confirm whether callers are expected to encode them.
    """
    return await baidu_spinder(keywords, num)

90
search/Bing.py Normal file
View File

@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
import asyncio
import os
import sys
import urllib.parse
from urllib.parse import quote
import aiohttp
from bs4 import BeautifulSoup
from colorama import Fore
# 添加项目根目录到 sys.path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import config
bingheaders=config.bingheaders
proxy=config.proxy
# Timeout settings passed to session.get() in getbing
timeout = aiohttp.ClientTimeout(
    total=None, # total timeout (None: not set)
    sock_connect=5.5, # connection timeout: 5.5 seconds
    sock_read=5.5 # read timeout: 5.5 seconds
)
async def getbing(url, session):
    """Fetch one Bing result page and extract the result links and titles.

    Args:
        url: full Bing search URL of the page.
        session: aiohttp client session used for the request.

    Returns:
        ``(urls, titles)`` — note the order: URLs first. ``([], [])`` is
        returned when the page cannot be fetched or parsed.
    """
    url_list = []
    title_list = []
    # The request is inside the try block, so a timeout or connection error
    # fails only this page instead of aborting every page being gathered.
    try:
        async with session.get(url, headers=bingheaders,timeout=timeout) as resp:
            a = await resp.text()
            soup = BeautifulSoup(a, 'lxml')
            for h in soup.select('h2 a'):
                hurl = h.get('href')
                if not hurl:
                    # An anchor without href carries no result link; skip it
                    continue
                htext = h.text.replace('\n', '').replace(',', ' ').strip()
                if not hurl.startswith(('http://', 'https://')):
                    # Relative link: make it absolute against the Bing domain
                    hurl = urllib.parse.urljoin('https://cn.bing.com/', hurl)
                print(htext," ",hurl)
                title_list.append(htext)
                url_list.append(hurl)
    except Exception:
        print(f"必应页面爬取失败,{url}该url无法正常获取数据。")
        return [],[]
    return url_list, title_list
async def bing_spinder(keyword, num):
    """Crawl Bing result pages for *keyword*.

    Args:
        keyword: search text, inserted into the request URL as given.
        num: a page count (``'5'`` means pages 1-5) or an inclusive range
            ``'start:end'`` (``'2:5'``). An int is accepted as well.

    Returns:
        ``(titles, urls)`` — two parallel lists covering all fetched pages.

    Raises:
        ValueError: if *num* has more than one ``':'`` or a side of the
            range is not a number.
    """
    print(f'必应爬取任务进行中,爬取页数为{num}...')
    print('标题 url')
    num = str(num)
    if ':' in num:
        if num.count(':') > 1:
            raise ValueError("输入中必须且只能包含一个 ':'")
        start_page, end_page = num.split(':')
        if not (start_page.isdigit() and end_page.isdigit()):
            raise ValueError("':' 两侧的值必须是数字")
        # Page numbers are turned into result offsets of 10 per page
        start_page = (int(start_page) - 1) * 10
        end_page = int(end_page) * 10
    else:
        start_page, end_page = 0, int(num) * 10

    async with aiohttp.ClientSession() as session:
        tasks = []
        for pn in range(start_page, end_page, 10):
            # The page offset has to be part of the URL, otherwise every
            # request returns the same first page. 'first' is sent as a
            # 1-based result index (1, 11, 21, ...).
            url = f'https://cn.bing.com/search?q={keyword}&qs=n&form=QBRE&sp=-1&lq=0&first={pn + 1}'
            tasks.append(asyncio.create_task(getbing(url, session)))
        result = await asyncio.gather(*tasks)

    urllist = []
    titlelist = []
    # getbing returns (urls, titles) for each page
    for page_urls, page_titles in result:
        urllist += page_urls
        titlelist += page_titles
    count=len(urllist)
    print(f"必应搜索爬取结果为{count}")
    print(Fore.GREEN + '必应爬取任务完成\n' + Fore.RESET)
    return titlelist, urllist
# await bingwriteCSV(titlelist, urllist, keyword)
def bing_main(keyword,num):
    """Synchronous entry point: run the Bing crawl and return its result.

    *keyword* is percent-encoded before it is handed to ``bing_spinder``.
    Returns the ``(titles, urls)`` pair produced by the crawl.
    """
    keyword = quote(keyword)
    if sys.platform.startswith('win'):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    # A dedicated loop is created explicitly: asyncio.get_event_loop() is
    # deprecated when no loop is current and raises on newer Python versions.
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(bing_spinder(keyword,num))
    finally:
        loop.close()
async def Bing_main(keywords, num):
    """Coroutine entry point: await ``bing_spinder`` and return its result.

    NOTE(review): unlike ``bing_main``, the keywords are passed on without
    ``quote()`` — confirm whether callers are expected to encode them.
    """
    return await bing_spinder(keywords, num)

Binary file not shown.

Binary file not shown.

Binary file not shown.

190
tool/read_csv.py Normal file
View File

@@ -0,0 +1,190 @@
import csv
from typing import List, Dict, Union, Any, Optional
class CSVReader:
    """Read a CSV file with or without a header row.

    Rows are exposed as dictionaries. With a header row the keys are the
    header names; without one they are generated as ``col_0``, ``col_1``, ...
    Columns can be selected by index or, when a header exists, by name.
    """

    def __init__(self, file_path: str):
        """Store the path of the CSV file; it is opened on every read.

        Args:
            file_path: path of the CSV file.
        """
        self.file_path = file_path

    def read(self, has_header: bool = True, encoding: str = 'utf-8') -> List[Dict[str, Any]]:
        """Read the whole file and return one dictionary per row.

        Args:
            has_header: whether the first row holds the column names.
            encoding: text encoding of the file.

        Returns:
            The rows in file order, keyed by header name or by ``col_<index>``.
        """
        # newline='' lets the csv module do its own line handling, so line
        # breaks embedded in quoted fields are kept as written.
        with open(self.file_path, 'r', encoding=encoding, newline='') as file:
            if has_header:
                return [dict(row) for row in csv.DictReader(file)]
            # No header: use the column position as the key
            return [
                {f'col_{i}': value for i, value in enumerate(row)}
                for row in csv.reader(file)
            ]

    def read_column(self, column: Union[int, str], has_header: bool = True,
                    encoding: str = 'utf-8') -> List[Any]:
        """Return the values of a single column.

        Args:
            column: column index (0-based) or, with a header, column name.
            has_header: whether the first row holds the column names.
            encoding: text encoding of the file.

        Returns:
            The column values in row order. Without a header, rows too short
            to contain the column are left out.

        Raises:
            IndexError: index out of range for a file with a header.
            KeyError: unknown column name.
            ValueError: *column* is a name but the file has no header, or it
                is neither an int nor a str.
        """
        data = self.read(has_header, encoding)
        if has_header and isinstance(column, str):
            return [row[column] for row in data]
        if not isinstance(column, int):
            raise ValueError("Invalid column parameter")
        if not has_header:
            column_name = f'col_{column}'
            return [row[column_name] for row in data if column_name in row]
        if not data:
            return []
        # With a header, translate the index into the header name
        keys = list(data[0].keys())
        if not 0 <= column < len(keys):
            raise IndexError(f"Column index {column} out of range")
        column_name = keys[column]
        return [row[column_name] for row in data]

    def read_columns(self, columns: Dict[str, Union[int, str]], has_header: bool = True,
                     encoding: str = 'utf-8') -> List[Dict[str, Any]]:
        """Return selected columns of every row, optionally renamed.

        Args:
            columns: maps each output key to a source column, given as an
                index or, with a header, as a column name.
            has_header: whether the first row holds the column names.
            encoding: text encoding of the file.

        Returns:
            One dictionary per row. A source column missing from a row gives
            an empty string. A column name used without a header is left out
            of the output.
        """
        all_data = self.read(has_header, encoding)
        header_keys = list(all_data[0].keys()) if all_data else []
        result = []
        for row in all_data:
            new_row = {}
            for new_name, old_column in columns.items():
                if isinstance(old_column, str) and has_header:
                    new_row[new_name] = row.get(old_column, '')
                elif isinstance(old_column, int):
                    if not has_header:
                        new_row[new_name] = row.get(f'col_{old_column}', '')
                    elif 0 <= old_column < len(header_keys):
                        new_row[new_name] = row.get(header_keys[old_column], '')
                    else:
                        new_row[new_name] = ''
            result.append(new_row)
        return result
# Usage example
if __name__ == "__main__":
    # Example 1 as originally described: a CSV file with a header row, e.g.
    # name,age,city
    # Alice,25,Beijing
    # Bob,30,Shanghai
    # Charlie,35,Guangzhou
    # NOTE(review): the live code below reads ../data.csv in header-less mode,
    # which does not match the description above.
    reader = CSVReader('../data.csv')
    # Read all rows (disabled)
    #all_data = reader.read(has_header=False)
    # Read all rows (disabled)
    # all_data = reader.read(has_header=True)
    # print("所有数据:", all_data)
    # Column 0 of every row, returned under the key 'company_name'
    selected_data_no_header = reader.read_columns({
        'company_name': 0
    }, has_header=False)
    print("所有数据:", selected_data_no_header)
    # The same column as a plain list of values
    selected_data_no_header = reader.read_column(0, has_header=False)
    print("所有数据:", selected_data_no_header)
# # 读取指定列(按列名)
# names = reader.read_column('name', has_header=True)
# print("姓名列:", names)
#
# # 读取指定列(按索引)
# ages = reader.read_column(1, has_header=True)
# print("年龄列:", ages)
#
# # 读取多列并重命名
# selected_data = reader.read_columns({
# '姓名': 'name',
# '年龄': 1
# }, has_header=True)
# print("选择的数据:", selected_data)
#
# # 示例2: 无表头的CSV文件
# # 假设有一个名为data_no_header.csv的文件内容如下:
# # Alice,25,Beijing
# # Bob,30,Shanghai
# # Charlie,35,Guangzhou
#
# reader2 = CSVReader('data_no_header.csv')
#
# # 读取所有数据
# all_data_no_header = reader2.read(has_header=False)
# print("无表头所有数据:", all_data_no_header)
#
# # 读取指定列(按索引)
# first_column = reader2.read_column(0, has_header=False)
# print("第一列:", first_column)
#
# # 读取多列并指定名称
# selected_data_no_header = reader2.read_columns({
# '姓名': 0,
# '城市': 2
# }, has_header=False)
# print("无表头选择的数据:", selected_data_no_header)

Binary file not shown.