Compare commits

...

10 Commits

Author SHA1 Message Date
manchuwork
97bf3eb194 aiqicha 2025-11-03 18:58:20 +08:00
manchuwork
de3c97e828 aiqicha 2025-11-03 18:57:58 +08:00
manchuwork
d3e12e3754 aiqicha 2025-10-04 01:21:51 +08:00
manchuwork
ce414ffc51 aiqicha 2025-10-04 01:19:56 +08:00
manchuwork
6d42506d4e 修改csvtool header 2025-10-04 01:18:05 +08:00
manchuwork
dd368bb645 修改aiqicha login逻辑, cookie文件位置 2025-10-03 07:45:21 +08:00
manchuwork
ecf17dbf1d web browser anti detection 2025-10-03 07:19:26 +08:00
manchuwork
84143ff6fb aiqicha 2025-10-03 03:02:27 +08:00
manchuwork
6444fecd4e aiqicha 2025-09-25 14:47:19 +08:00
manchuwork
54f3beded9 aiqicha 2025-09-25 03:19:50 +08:00
102 changed files with 34015 additions and 85 deletions

27
.gitignore vendored Normal file
View File

@@ -0,0 +1,27 @@
.gitignore
.git/
.idea/
.vscode/
.DS_Store
.env
.env.local
.env.development.local
.env.test.local
.env.production.local
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
node_modules/
dist/
build/
.next/
.out/
.serverless/
cdk.out/
.vscode-test/
cookies/
data/
cookies/*
data/*

View File

@@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.13 (SearchCompany)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.13 virtualenv at D:\gitstudy\pythonwork\SearchCompany\.venv" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

1
.idea/misc.xml generated
View File

@@ -3,4 +3,5 @@
<component name="Black">
<option name="sdkName" value="Python 3.13 (SearchCompany)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 virtualenv at D:\gitstudy\pythonwork\SearchCompany\.venv" project-jdk-type="Python SDK" />
</project>

View File

@@ -0,0 +1,372 @@
[
{
"name": "BAIDUID",
"value": "08CC3EA21C8988FC5B603A95F1A0AF62:FG=1",
"domain": ".baidu.com",
"path": "/",
"expires": 1789986039.693011,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "BAIDUID_BFESS",
"value": "08CC3EA21C8988FC5B603A95F1A0AF62:FG=1",
"domain": ".baidu.com",
"path": "/",
"expires": 1789986039.693241,
"httpOnly": false,
"secure": true,
"sameSite": "None"
},
{
"name": "HMACCOUNT",
"value": "8C7AB8DF0F9CE07B",
"domain": ".hm.baidu.com",
"path": "/",
"expires": 1793010042.31394,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "HMACCOUNT_BFESS",
"value": "8C7AB8DF0F9CE07B",
"domain": ".hm.baidu.com",
"path": "/",
"expires": 1793010042.314104,
"httpOnly": false,
"secure": true,
"sameSite": "None"
},
{
"name": "Hm_lvt_ad52b306e1ae4557f5d3534cce8f8bbf",
"value": "1758450044",
"domain": ".aiqicha.baidu.com",
"path": "/",
"expires": 1789986067,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "HMACCOUNT",
"value": "8C7AB8DF0F9CE07B",
"domain": ".aiqicha.baidu.com",
"path": "/",
"expires": -1,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "in_source",
"value": "",
"domain": ".baidu.com",
"path": "/",
"expires": 1758478844,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "log_first_time",
"value": "1758450044117",
"domain": ".baidu.com",
"path": "/",
"expires": -1,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "ab_jid",
"value": "33e869a6776006b9b15226621d9e962a787d",
"domain": ".miao.baidu.com",
"path": "/",
"expires": 1793010068.841011,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "ab_jid_BFESS",
"value": "33e869a6776006b9b15226621d9e962a787d",
"domain": ".miao.baidu.com",
"path": "/",
"expires": 1793010068.841335,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "_j47_ka8_",
"value": "57",
"domain": "aiqicha.baidu.com",
"path": "/",
"expires": 1761042068,
"httpOnly": false,
"secure": true,
"sameSite": "Lax"
},
{
"name": "HOSUPPORT",
"value": "1",
"domain": ".passport.baidu.com",
"path": "/",
"expires": 1793010060.19105,
"httpOnly": true,
"secure": false,
"sameSite": "Lax"
},
{
"name": "HOSUPPORT_BFESS",
"value": "1",
"domain": ".passport.baidu.com",
"path": "/",
"expires": 1793010060.19133,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "pplogid",
"value": "5005QrhvcIwxvnTtxI9IW%2BSTbu%2FDtMhDFFF7VXO73YzDaXhejFyXhoTmF6%2FoxfHfWc%2FGKNog4vZmg3d%2BeyTMkdHx4n6KZhjPaYKka%2BpG1YSsq%2FU9RBKodLkTX%2BM%2BtFgb%2FtYE",
"domain": "passport.baidu.com",
"path": "/",
"expires": 1758709260.321525,
"httpOnly": true,
"secure": false,
"sameSite": "Lax"
},
{
"name": "ppfuid",
"value": "FOCoIC3q5fKa8fgJnwzbE67EJ49BGJeplOzf+4l4EOvDuu2RXBRv6R3A1AZMa49I27C0gDDLrJyxcIIeAeEhD8JYsoLTpBiaCXhLqvzbzmvy3SeAW17tKgNq/Xx+RgOdb8TWCFe62MVrDTY6lMf2GrfqL8c87KLF2qFER3obJGkwDXhiatZU134EstSwAukYGEimjy3MrXEpSuItnI4KDz1YaR1IZypH7jXKWgQGdLUIbqunalLqi9QYrPILq+HRhUTk7XtVmzOiegF/IthbqhJsVwXkGdF24AsEQ3K5XBbh9EHAWDOg2T1ejpq0s2eFy9ar/j566XqWDobGoNNfmfpaEhZpob9le2b5QIEdiQez0E9SVndeXkd9EampG0PcXhLZ126CPFCIEuj/nWa+ROvbdS8t8Eyv8n0vz4h5jrDOXSxFWocn8LvXoXRLp3fob7NhjjiRXUXX5X+E0Ih4/0pXKZoFVHNbMJ2R4OK+61HHZFO3SaB4GS7zlBrG2cLm8lTRl19JYcYcqvy3P/50mxpWDwUUC4pvKOF9e+pwNq7l6HzKEZyCMUDd+W6AiaksYiu+4AAz72OnMQfgAyNUbW3IyzL5c+UBht87WUigOY9alcIuR+n1gwn+Dmf3unATYGtv0zKmAog3Ny9wFYiQ/gdKSrR9D25HSwrLQyIe5QKTkKSlY6nVev8MhaT3AUPwNqYIvWCQZXWkhuuU0ZXLMYAKJSeHY7mTrwwSSKC3ZaI5cl3oyGpSAbUwfZCJlI7vmML9W1bEYcJg2TDoeA58Au0Sh/BHYP7IeRGWgQ9aT0R91rSPWb2eeCt263/A+EJVR/A8+3BQ92SIDoXabq8Wb8ZGN9BAsC9g5OdjE6lhwzTadptHqT7mZN901gDzA4lMYEG/kekC+0J5/N5yVy+ei7UKhQHejRjxCO2+98Bn9ob54NQNdAIeQiKh/G4Ess0R11eH32ky9E0mZirll/qIXNd7jPkj5hbBFquQKM4S+tDJ34jmplOTrqqKT7PPVfrdgd4OkK13pEy86BsJ8M0gKXgtivUgM8Bjl1m/pkg0SuDyntWLdrmMxcZYvgySvSSwQ2Qtm8EkKHIMyR/XgfHnpX5vadGpRMro2qaE8u+x8w1gJHIRKib2u6Q1JtQiZE1Rde/vRx8xKfg6uYR37n0BvfgJE5+KbeuwCyAvJRGUA2fpt0VClIfV0m2PRG7bvH00OODKY6cFi7NgWAK6Jc1G4QXB1yDvA2w8ak+yOoVsy/cIBF37aBhyiPWPAOeYXBqA",
"domain": ".baidu.com",
"path": "/",
"expires": -1,
"httpOnly": false,
"secure": true,
"sameSite": "Lax"
},
{
"name": "pplogid_BFESS",
"value": "5005QrhvcIwxvnTtxI9IW%2BSTbu%2FDtMhDFFF7VXO73YzDaXhejFyXhoTmF6%2FoxfHfWc%2FGKNog4vZmg3d%2BeyTMkdHx4n6KZhjPaYKka%2BpG1YSsq%2FU9RBKodLkTX%2BM%2BtFgb%2FtYE",
"domain": ".passport.baidu.com",
"path": "/",
"expires": 1758709261.178857,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "XFI",
"value": "acbbd730-96d4-11f0-abaf-f5c081f47fae",
"domain": "aiqicha.baidu.com",
"path": "/",
"expires": -1,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "XFCS",
"value": "F8A576D0646C4B7C0E67CAA804F8AA617DC4022AE0AF73420705F4533C0CA40B",
"domain": "aiqicha.baidu.com",
"path": "/",
"expires": -1,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "UBI",
"value": "fi_PncwhpxZ%7ETaJc-0Ex5oMyh1%7EDGnfkiIU",
"domain": ".passport.baidu.com",
"path": "/",
"expires": 1793010066.176502,
"httpOnly": true,
"secure": false,
"sameSite": "Lax"
},
{
"name": "STOKEN",
"value": "69c091a9b764753279341444f4026ba6069812bf241650f06e762972a5ac94e9",
"domain": ".passport.baidu.com",
"path": "/",
"expires": 1793010066.176577,
"httpOnly": true,
"secure": true,
"sameSite": "Lax"
},
{
"name": "BDUSS",
"value": "FPZ29QenBnTVZ3TzRPaU9RSkFyMVIybnlDNVQ2ZXBpfmNDTG9uSGJ-dVNYdmRvSVFBQUFBJCQAAAAAAQAAAAEAAACisPpgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJLRz2iS0c9oY3",
"domain": ".baidu.com",
"path": "/",
"expires": 1793010066.17662,
"httpOnly": true,
"secure": false,
"sameSite": "Lax"
},
{
"name": "PTOKEN",
"value": "eb75cc45bb642db7917ed3e1d8bac1bf",
"domain": ".passport.baidu.com",
"path": "/",
"expires": 1793010066.176674,
"httpOnly": true,
"secure": true,
"sameSite": "Lax"
},
{
"name": "BDUSS_BFESS",
"value": "FPZ29QenBnTVZ3TzRPaU9RSkFyMVIybnlDNVQ2ZXBpfmNDTG9uSGJ-dVNYdmRvSVFBQUFBJCQAAAAAAQAAAAEAAACisPpgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJLRz2iS0c9oY3",
"domain": ".baidu.com",
"path": "/",
"expires": 1793010066.176724,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "STOKEN_BFESS",
"value": "69c091a9b764753279341444f4026ba6069812bf241650f06e762972a5ac94e9",
"domain": ".passport.baidu.com",
"path": "/",
"expires": 1793010066.176765,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "PTOKEN_BFESS",
"value": "eb75cc45bb642db7917ed3e1d8bac1bf",
"domain": ".passport.baidu.com",
"path": "/",
"expires": 1793010066.176804,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "UBI_BFESS",
"value": "fi_PncwhpxZ%7ETaJc-0Ex5oMyh1%7EDGnfkiIU",
"domain": ".passport.baidu.com",
"path": "/",
"expires": 1793010066.17684,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "BDPPN",
"value": "6901ba7ef94483ef6b2d1dc2a4db3460",
"domain": "aiqicha.baidu.com",
"path": "/",
"expires": 1793010067.459182,
"httpOnly": true,
"secure": true,
"sameSite": "Lax"
},
{
"name": "login_type",
"value": "passport",
"domain": "aiqicha.baidu.com",
"path": "/",
"expires": 1761042067.45968,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "_t4z_qc8_",
"value": "xlTM-TogKuTw4I-uitJRzYiRi62JbSlawwmd",
"domain": "aiqicha.baidu.com",
"path": "/",
"expires": 1793010067.459897,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "ab175844880",
"value": "994c5d3e76cd91fe447202b5472f304317584500676f5",
"domain": "aiqicha.baidu.com",
"path": "/",
"expires": 1758453667.695524,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "Hm_lpvt_ad52b306e1ae4557f5d3534cce8f8bbf",
"value": "1758450068",
"domain": ".aiqicha.baidu.com",
"path": "/",
"expires": -1,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "ab_bid",
"value": "a6776006b9b15226621d9e962a787db165e6",
"domain": ".miao.baidu.com",
"path": "/",
"expires": 1793010068.841152,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "ab_sr",
"value": "1.0.1_OWQ1OTdkYmMwNDM5MmNkNTJlZGQ4ZWIwOTFmNzU0OGM4MjE0YTMyZTliMzJmZGVlNmJjNjYyZmM2NDgxMWRmZGJmZTM2NGMzMGIyNWU4MGNkOTliNWUxMGMxZDNiMjYyMGM4YzU5Y2Q1ZjRiYjA3YjVhOTk5MmQxNzA0ZmRjNzRkOTA4NzExYzI0OTdhNTI5NDZlNWEzNzY4MTdkOTZlNw==",
"domain": ".baidu.com",
"path": "/",
"expires": 1758457268.841241,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "_s53_d91_",
"value": "999095d23512de0d54b2a616fed7706a6ddb9f5be10a393f0ded8660ff845171193d32f34a4da6aead9a56f13674d5e208b08a4659da6d0aac1344befe8162e2f26141e837b5b3dd11731592823a96725fe0c235f64dabbfb5897987d6b423c39bd8bbd2e594d8077b851ad8d70f91b7b2cd299c56b03ae5b5167e21a7866739d9edcdaae2a3787821d83065550e61a10925c00154c45c7fa4eb8f5e61f0a7c013c6c7c371c3956a64cd117e322a1c6ecf8657476fbe9b5633a5b740bae757d01794ef1464667aef974f4f78d4ca11b4",
"domain": "aiqicha.baidu.com",
"path": "/",
"expires": 1761042068,
"httpOnly": false,
"secure": true,
"sameSite": "Lax"
},
{
"name": "_y18_s21_",
"value": "18dc1479",
"domain": "aiqicha.baidu.com",
"path": "/",
"expires": 1761042068,
"httpOnly": false,
"secure": true,
"sameSite": "Lax"
},
{
"name": "log_last_time",
"value": "1758450069501",
"domain": ".baidu.com",
"path": "/",
"expires": -1,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "RT",
"value": "\"z=1&dm=baidu.com&si=47fa02b3-b891-470b-8a4e-5b56ee2c4a8f&ss=mftjr0mr&sl=4&tt=jjm&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=req\"",
"domain": ".baidu.com",
"path": "/",
"expires": 1759054873,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
}
]

1
company/auth_state.json Normal file
View File

@@ -0,0 +1 @@
{}

1
company/qcc_cookies.json Normal file
View File

@@ -0,0 +1 @@
[]

86
company/qcc_crawler.py Normal file
View File

@@ -0,0 +1,86 @@
from playwright.sync_api import sync_playwright
import json
import os
import time
COOKIE_FILE = "qcc_cookies.json"
def ensure_cookie_file():
    """Create an empty cookie store at ``COOKIE_FILE`` when none exists yet.

    A newly created file contains an empty JSON list and a notice is printed.
    An existing file is left untouched.
    """
    if os.path.exists(COOKIE_FILE):
        return
    with open(COOKIE_FILE, 'w') as handle:
        json.dump([], handle)
    print(f"已创建新的cookie文件: {COOKIE_FILE}")
def save_cookies(context):
    """Dump the cookies reported by ``context.cookies()`` to ``COOKIE_FILE``.

    The file is overwritten with indented JSON and a confirmation is printed.
    """
    current = context.cookies()
    with open(COOKIE_FILE, 'w') as handle:
        json.dump(current, handle, indent=2)
    print(f"Cookies已保存到 {COOKIE_FILE}")
def load_cookies(context):
    """Load cookies from ``COOKIE_FILE`` into ``context``.

    Returns:
        True when the file held a non-empty value that was passed to
        ``context.add_cookies``; False when the stored value was empty or
        when any exception occurred (the error is printed, not raised).
    """
    try:
        with open(COOKIE_FILE, 'r') as handle:
            stored = json.load(handle)
        if not stored:
            return False
        context.add_cookies(stored)
    except Exception as e:
        # Best effort: any failure simply means "no usable cookies".
        print(f"加载cookies失败: {str(e)}")
        return False
    return True
def qcc_login(page):
    """Drive the QR-code login flow on qcc.com using ``page``.

    Opens the home page, waits up to 5 s for ``.login-container``, clicks the
    "扫码登录" entry, then waits up to 120 s for the URL to match
    ``**/usercenter**`` while the user scans the code.
    """
    container_timeout_ms = 5000
    scan_timeout_ms = 120000

    page.goto("https://www.qcc.com")
    page.wait_for_selector(".login-container", timeout=container_timeout_ms)
    page.click("text=扫码登录")
    print("请扫描页面二维码登录...")

    page.wait_for_url("**/usercenter**", timeout=scan_timeout_ms)
    print("登录成功!")
def search_company(page, company_name):
    """Search qcc.com for ``company_name`` and return the first hit's summary.

    Args:
        page: Browser page object used for navigation and element lookup.
        company_name: Company name to search for.

    Returns:
        dict with keys ``name``, ``legal_rep`` and ``status`` taken from the
        first matching ``.company-name``, ``.legal-person`` and
        ``.company-status`` elements.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    # Percent-encode the name so characters such as '&', '#', '+' or spaces
    # cannot truncate or split the ``key`` query parameter.
    encoded_name = quote(company_name, safe="")
    page.goto(f"https://www.qcc.com/web/search?key={encoded_name}")
    page.wait_for_selector(".search-result-item", timeout=10000)
    # Sample data extraction: only the first result is read.
    company_info = {
        "name": page.locator(".company-name").first.inner_text(),
        "legal_rep": page.locator(".legal-person").first.inner_text(),
        "status": page.locator(".company-status").first.inner_text()
    }
    return company_info
def main():
    """Log in to qcc.com (or reuse saved cookies) and query three companies.

    Ensures the cookie file exists, launches a visible Chromium browser,
    restores cookies or runs the QR-code login, then searches a fixed list of
    company names, printing each result and re-saving cookies after every
    successful query. A failed query is reported and the loop continues.
    """
    ensure_cookie_file()
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context()
            try:
                page = context.new_page()
                if not load_cookies(context):
                    qcc_login(page)
                    save_cookies(context)
                companies = ["阿里巴巴", "腾讯科技", "华为技术"]
                for company in companies:
                    try:
                        info = search_company(page, company)
                        print(f"{info['name']} | 法人:{info['legal_rep']} | 状态:{info['status']}")
                        save_cookies(context)  # refresh stored cookies after each query
                        time.sleep(3)
                    except Exception as e:
                        # Best effort per company: report and move on.
                        print(f"查询 {company} 失败: {str(e)}")
            finally:
                # Close the context even when login or setup raised.
                context.close()
        finally:
            browser.close()


if __name__ == "__main__":
    main()

2
cookies/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*
*.json

View File

@@ -294,7 +294,7 @@ def check_company_exists(company_names, type_list):
if __name__ == '__main__':
reader = CSVReader('data.csv')
reader = CSVReader('data/data.csv')
company_names = reader.read_column(0, has_header=False)
print("所有数据:", company_names)
@@ -307,7 +307,7 @@ if __name__ == '__main__':
with BingSearcher() as searcher:
# 创建CSV工具实例
csv_tool = CSVTool(
csv_file_name='company_search_bing_data.csv',
csv_file_name='data/company_search_bing_data.csv',
headers=['title', 'url', 'web_site_type', 'request_url', 'company_name', 'create_time']
)
# 查询所有数据
@@ -346,7 +346,7 @@ if __name__ == '__main__':
filter_list_with_not_match = filter_aiqicha_qcc(data_list, company_name, with_not_match=True)
# 创建CSV工具实例
csv_tool = CSVTool(
csv_file_name='company_search_filter_is_none_data.csv',
csv_file_name='data/company_search_filter_is_none_data.csv',
headers=['company_name','title', 'web_site_type','url', 'request_url', 'create_time']
)
@@ -357,7 +357,7 @@ if __name__ == '__main__':
else:
# 创建CSV工具实例
csv_tool = CSVTool(
csv_file_name='company_search_bing_data.csv',
csv_file_name='data/company_search_bing_data.csv',
headers=['company_name','title', 'web_site_type','url', 'request_url', 'create_time']
)
@@ -373,8 +373,8 @@ if __name__ == '__main__':
# results2 = searcher.search("腾讯", 1)
# results3 = searcher.search("百度", 1)
sleep_time = 5
sleep_time += random.randint(3, 10)
sleep_time = 3
sleep_time += random.randint(1, 2)
time.sleep(sleep_time)
pass
pass

View File

@@ -15,6 +15,24 @@ def query_init_company_data(csv_file_name):
print("所有数据:", all_data)
return all_data
def parse_operating_period(period_str):
    """Split an operating-period string into a start date and an end date.

    Supported shapes (see examples): ``"<start>至<end>"`` for a bounded
    period and ``"<start>起长期"`` for an open-ended one.

    Args:
        period_str: Raw operating-period text; may be empty or None.

    Returns:
        A ``(start, end)`` tuple of strings. ``end`` is ``'长期'`` for an
        open-ended period and ``''`` when no separator is present. An empty
        or None input yields ``('', '')``.

    Examples:
        >>> parse_operating_period("2020-01-01至2030-12-31")
        ('2020-01-01', '2030-12-31')
        >>> parse_operating_period("2020-01-01起长期")
        ('2020-01-01', '长期')
    """
    if not period_str:
        return '', ''

    # The separators must be non-empty: '' is contained in every string and
    # str.split('') raises ValueError.
    if '至' in period_str:
        start, _, end = period_str.partition('至')
        return str(start.strip()), str(end.strip())
    if '起' in period_str:
        start, _, _ = period_str.partition('起')
        return str(start.strip()), '长期'
    return str(period_str.strip()), ''
def crawl_and_save_aiqicha_details(input_csv, output_csv):
"""
从CSV文件中读取爱企查URL爬取企业详情并保存到新的CSV文件中
@@ -41,7 +59,22 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
'address', # 公司地址
'business_scope', # 经营范围
'source_url', # 原始URL
'create_time' # 创建时间
# 新增字段
'company_type', # 企业类型
'industry', # 所属行业
'registration_authority', # 登记机关
'operating_period', # 营业期限
'operating_start_date', # 营业期限开始日期
'operating_end_date', # 营业期限结束日期
'actual_capital', # 实缴资本
'taxpayer_id', # 纳税人识别号
'organization_code', # 组织机构代码
'approved_date', # 核准日期
'staff_size', # 参保人数
'phone', # 电话,
'create_time',
# 创建时间
]
# 创建输出CSV工具实例
@@ -49,7 +82,34 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
csv_file_name=output_csv,
headers=output_headers
)
# 读取已有的企业名称,用于去重
existing_company_names = set()
try:
existing_data = output_csv_tool.get_all_data()
existing_company_names = {item['company_name'] for item in existing_data if item['company_name']}
print(f"已存在 {len(existing_company_names)} 条企业数据")
except FileNotFoundError:
print(f"输出文件 {output_csv} 不存在,将创建新文件")
except Exception as e:
print(f"读取已有数据时出错: {e}")
# 过滤掉已存在的企业数据
filtered_aiqicha_data = []
for item in aiqicha_data:
company_name = item.get('company_name', '')
if company_name and company_name in existing_company_names:
print(f"跳过已存在的企业: {company_name}")
else:
filtered_aiqicha_data.append(item)
aiqicha_data = filtered_aiqicha_data
print(f'过滤后剩余 {len(aiqicha_data)} 条爱企查数据待处理')
if len(aiqicha_data) <= 0:
print("没有待处理的爱企查数据,退出")
return
# 使用爱企查详情爬虫
with AiqichaDetailCrawler() as crawler:
company_details = []
@@ -65,6 +125,11 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
if detail:
# 添加来源URL和公司名称
# 解析营业期限字段
operating_period = detail.get('operating_period', '')
start_date, end_date = parse_operating_period(operating_period)
detail['source_url'] = url
# 转换字段名以匹配CSV表头
converted_item = {
@@ -76,7 +141,20 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
'business_status': detail.get('business_status', ''),
'address': detail.get('address', ''),
'business_scope': detail.get('business_scope', ''),
'source_url': detail.get('source_url', '')
'source_url': detail.get('source_url', ''),
# 新增字段映射
'company_type': detail.get('company_type', ''),
'industry': detail.get('industry', ''),
'registration_authority': detail.get('registration_authority', ''),
'operating_period': detail.get('operating_period', ''),
'operating_start_date': str(start_date),
'operating_end_date': str(end_date),
'actual_capital': detail.get('actual_capital', ''),
'taxpayer_id': detail.get('taxpayer_id', ''),
'organization_code': detail.get('organization_code', ''),
'approved_date': detail.get('approved_date', ''),
'staff_size': detail.get('staff_size', ''),
'phone': detail.get('phone', '')
}
# 立即保存每条数据,避免数据丢失
@@ -97,14 +175,15 @@ def crawl_and_save_aiqicha_details(input_csv, output_csv):
# 添加延迟,避免请求过快
time.sleep(2)
next_sleep_interval = random.uniform(5, 15)
next_sleep_interval = random.uniform(3, 15)
time.sleep(next_sleep_interval)
print(f"总共成功处理并保存了 {success_count} 条企业详情数据到 {output_csv}")
# crawler.browser.close_browser()
if __name__ == '__main__':
# 从原始搜索结果CSV中读取爱企查URL爬取详情并保存到新CSV文件
crawl_and_save_aiqicha_details('company_search_bing_data.csv', 'aiqicha_company_details.csv')
crawl_and_save_aiqicha_details('data/company_search_bing_data.csv', 'data/aiqicha_company_details.csv')
# 原有代码保留
# all_data = query_init_company_data('company_search_bing_data.csv')

2
data/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*
data.csv

View File

@@ -0,0 +1,47 @@
杭州元氪科技有限公司
杭州华立创客社区管理有限公司
杭州迅能贸易有限公司
杭州栖梦工坊科技服务有限公司
瑞幸咖啡(杭州)有限公司
杭州余杭区五常街道莱欧烘焙店
杭州余杭融昭文化创意工作室(个体工商户)
杭州余杭区五常街道梓鑫水果店
杭州三乐进出口有限公司
杭州余杭区五常街道慧欣图文设计工作室
杭州质享电子商务有限责任公司
杰茜荟乳业(上海)有限公司杭州余杭科技分公司
浙江正播影视文化传媒有限公司
杭州品思电子商务有限公司
杭州芯桥智联科技有限公司
杭州帛蔻进出口有限公司
杭州佰磊斯科技有限公司
杭州括号传媒有限公司
杭州赤骥贸易有限公司
杭州键嘉医疗科技股份有限公司
阳光财产保险股份有限公司杭州中心支公司
杭州智风科技有限公司
杭州勇达检测技术有限公司
杭州初米网络技术有限公司
杭州和辰电力科技有限公司
杭州润州光电技术有限公司
杭州君莱通信科技有限公司
杭州余杭图王广告设计工作室(个体工商户)
杭州润光软件技术有限公司
杭州易光科技有限公司
杭州奇课文化传媒有限公司
杭州柯拉科技有限公司
杭州备胎说车科技有限公司
浙江绿色共享教育基金会
杭州爱维因健康科技有限公司
浙江润影医疗科技有限公司
杭州花蜂科技有限公司
连云港金康和信药业有限公司
杭州立镖信息科技有限公司
杭州番石榴供应链管理有限公司
杭州昇辉生物技术有限公司
湖州益荣服饰有限公司
杭州顾嘉网络科技有限公司
厨何以(杭州)电子商务有限公司
杭州优冠商贸有限公司
永馨智慧科技(杭州)有限公司
杭州钱唐隆腾医疗技术有限公司
1 杭州元氪科技有限公司
2 杭州华立创客社区管理有限公司
3 杭州迅能贸易有限公司
4 杭州栖梦工坊科技服务有限公司
5 瑞幸咖啡(杭州)有限公司
6 杭州余杭区五常街道莱欧烘焙店
7 杭州余杭融昭文化创意工作室(个体工商户)
8 杭州余杭区五常街道梓鑫水果店
9 杭州三乐进出口有限公司
10 杭州余杭区五常街道慧欣图文设计工作室
11 杭州质享电子商务有限责任公司
12 杰茜荟乳业(上海)有限公司杭州余杭科技分公司
13 浙江正播影视文化传媒有限公司
14 杭州品思电子商务有限公司
15 杭州芯桥智联科技有限公司
16 杭州帛蔻进出口有限公司
17 杭州佰磊斯科技有限公司
18 杭州括号传媒有限公司
19 杭州赤骥贸易有限公司
20 杭州键嘉医疗科技股份有限公司
21 阳光财产保险股份有限公司杭州中心支公司
22 杭州智风科技有限公司
23 杭州勇达检测技术有限公司
24 杭州初米网络技术有限公司
25 杭州和辰电力科技有限公司
26 杭州润州光电技术有限公司
27 杭州君莱通信科技有限公司
28 杭州余杭图王广告设计工作室(个体工商户)
29 杭州润光软件技术有限公司
30 杭州易光科技有限公司
31 杭州奇课文化传媒有限公司
32 杭州柯拉科技有限公司
33 杭州备胎说车科技有限公司
34 浙江绿色共享教育基金会
35 杭州爱维因健康科技有限公司
36 浙江润影医疗科技有限公司
37 杭州花蜂科技有限公司
38 连云港金康和信药业有限公司
39 杭州立镖信息科技有限公司
40 杭州番石榴供应链管理有限公司
41 杭州昇辉生物技术有限公司
42 湖州益荣服饰有限公司
43 杭州顾嘉网络科技有限公司
44 厨何以(杭州)电子商务有限公司
45 杭州优冠商贸有限公司
46 永馨智慧科技(杭州)有限公司
47 杭州钱唐隆腾医疗技术有限公司

View File

@@ -1,7 +1,7 @@
杭州辉煌物业管理有限公司
杭州辉望科技有限公司
浙江八方电信科技集团有限公司
中国移动通信集团浙江有限公司余杭分公司西溪八方城(自营厅)
中国移动通信集团浙江有限公司余杭分公司西溪八方城
瑞凤九天(杭州)科技有限公司
金码智能科技(杭州)有限公司
OPPO广东移动通信有限公司
@@ -15,7 +15,7 @@ OPPO广东移动通信有限公司
杭州钱橙似锦科技有限公司
杭州奥得徕贸易有限公司
杭州伍壹荟旅游咨资询有限公司
杭州心满意供应链服务有限公司
杭州心满意供应链服务有限公司
杭州麒晨科技有限公司
杭州羊咩咩文化传媒有限公司
杭州禾露则正生物科技有限公司
@@ -30,7 +30,7 @@ OPPO广东移动通信有限公司
杭州裕阳经营管理合伙企业(有限合伙)
氧气.康复中心
杭州云迹物联科技有限公司
杭州着墨文化创意限公司
杭州着墨文化创意限公司
亚信科技(南京)有限公司
杭州密尔沃智能装备有限公司
杭州骏远电子商务有限公司
@@ -41,7 +41,7 @@ OPPO广东移动通信有限公司
杭州琑为缘文化艺术有限公司
浙江丝里伯睡眠科技股份有限公司
杭州倍驰科技有限公司
杭州心灵部落教育有限公司(灵动生活)
杭州心灵部落教育有限公司
杭州云印智造科技有限公司
浙江海拓环境技术有限公司
申能环境科技有限公司
@@ -50,7 +50,7 @@ OPPO广东移动通信有限公司
杭州字节跳动科技有限公司
杭州邻汇网络科技有限公司
浙江建盛安全科技有限公司
幻想集团·杭州运营中心
石家庄幻想企业管理咨询有限公司
杭州阿克莱斯设备有限公司
浙江省现代农业促进会
益思芯科技(杭州)有限公司
1 杭州辉煌物业管理有限公司
2 杭州辉望科技有限公司
3 浙江八方电信科技集团有限公司
4 中国移动通信集团浙江有限公司余杭分公司西溪八方城(自营厅) 中国移动通信集团浙江有限公司余杭分公司西溪八方城
5 瑞凤九天(杭州)科技有限公司
6 金码智能科技(杭州)有限公司
7 OPPO广东移动通信有限公司
15 杭州钱橙似锦科技有限公司
16 杭州奥得徕贸易有限公司
17 杭州伍壹荟旅游咨资询有限公司
18 杭州心满意定供应链服务有限公司 杭州心满意足供应链服务有限公司
19 杭州麒晨科技有限公司
20 杭州羊咩咩文化传媒有限公司
21 杭州禾露则正生物科技有限公司
30 杭州裕阳经营管理合伙企业(有限合伙)
31 氧气.康复中心
32 杭州云迹物联科技有限公司
33 杭州着墨文化创意者限公司 杭州着墨文化创意有限公司
34 亚信科技(南京)有限公司
35 杭州密尔沃智能装备有限公司
36 杭州骏远电子商务有限公司
41 杭州琑为缘文化艺术有限公司
42 浙江丝里伯睡眠科技股份有限公司
43 杭州倍驰科技有限公司
44 杭州心灵部落教育有限公司(灵动生活) 杭州心灵部落教育有限公司
45 杭州云印智造科技有限公司
46 浙江海拓环境技术有限公司
47 申能环境科技有限公司
50 杭州字节跳动科技有限公司
51 杭州邻汇网络科技有限公司
52 浙江建盛安全科技有限公司
53 幻想集团·杭州运营中心 石家庄幻想企业管理咨询有限公司
54 杭州阿克莱斯设备有限公司
55 浙江省现代农业促进会
56 益思芯科技(杭州)有限公司

File diff suppressed because one or more lines are too long

11925
demo/aiqicha-datail.html Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

1262
demo/all_body.html Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 466 KiB

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

831
demo/qichacha_aiqicha.js Normal file
View File

@@ -0,0 +1,831 @@
// ==UserScript==
// @name 爱企查&企查查等
// @namespace http://tampermonkey.net/
// @version 0.2
// @description 在页面右下角添加工具按钮,支持复制源码和解析公司信息
// @author You
// @match https://www.qcc.com/firm/*
// @match https://aiqicha.baidu.com/company_detail_*
// @grant none
// ==/UserScript==
(function () {
"use strict";
// 工具类 - 存放通用函数
/**
 * Static helpers shared by the parsers in this userscript: text cleanup,
 * selector-based text lookup, clipboard copy, and small on-page dialogs.
 */
class ToolUtils {
  /**
   * Normalise a text fragment: whitespace runs become a single space, any
   * remaining CR/LF/TAB characters are removed, and both ends are trimmed.
   */
  static cleanText(text) {
    const collapsed = text.replace(/\s+/g, " ");
    const withoutControls = collapsed.replace(/[\r\n\t]/g, "");
    return withoutControls.trim();
  }

  /**
   * Return the trimmed text of the first selector in `selectors` that
   * matches an element with non-empty text inside `doc`, or "" if none does.
   */
  static extractText(doc, selectors) {
    for (const selector of selectors) {
      const trimmed = doc.querySelector(selector)?.textContent.trim();
      if (trimmed) {
        return trimmed;
      }
    }
    return "";
  }

  /**
   * Copy `content` to the clipboard through a temporary, invisible textarea
   * and `document.execCommand("copy")`. When `successMessage` is truthy it is
   * shown as a success toast afterwards.
   */
  static copyToClipboard(content, successMessage) {
    const scratch = document.createElement("textarea");
    scratch.value = content;
    Object.assign(scratch.style, {
      position: "fixed",
      top: "0",
      left: "0",
      width: "1px",
      height: "1px",
      opacity: "0",
    });

    document.body.appendChild(scratch);
    scratch.select();
    document.execCommand("copy");
    document.body.removeChild(scratch);

    if (!successMessage) {
      return;
    }
    this.showAutoCloseMessage(successMessage, "success");
  }

  /**
   * Show a centred toast that replaces `alert`. It starts fading after
   * 2 seconds and is removed 300 ms later. `type` selects the background
   * colour: "success", "error", or anything else for the info colour.
   */
  static showAutoCloseMessage(message, type = "info") {
    const backgrounds = new Map([
      ["success", "#52c41a"],
      ["error", "#f5222d"],
    ]);

    const toast = document.createElement("div");
    toast.textContent = message;
    Object.assign(toast.style, {
      position: "fixed",
      top: "50%",
      left: "50%",
      transform: "translate(-50%, -50%)",
      backgroundColor: backgrounds.get(type) ?? "#1890ff",
      color: "white",
      padding: "10px 20px",
      borderRadius: "4px",
      zIndex: "10001",
      boxShadow: "0 2px 8px rgba(0,0,0,0.15)",
      transition: "opacity 0.3s",
    });
    document.body.appendChild(toast);

    const removeIfAttached = () => {
      if (document.body.contains(toast)) {
        document.body.removeChild(toast);
      }
    };

    // Close automatically after 2 seconds.
    setTimeout(() => {
      if (!document.body.contains(toast)) {
        return;
      }
      // Fade out first, then actually remove the element.
      toast.style.opacity = "0";
      setTimeout(removeIfAttached, 300);
    }, 2000);
  }

  /**
   * Show `data` as pretty-printed JSON in a centred modal with a
   * "copy JSON" button and a "close" button. The modal stays open until the
   * close button is clicked (there is no auto-dismiss).
   */
  static showResult(data) {
    const applyButtonLook = (button, background) => {
      Object.assign(button.style, {
        marginTop: "10px",
        padding: "8px 16px",
        backgroundColor: background,
        color: "white",
        border: "none",
        borderRadius: "4px",
        cursor: "pointer",
      });
    };

    const modal = document.createElement("div");
    Object.assign(modal.style, {
      position: "fixed",
      top: "50%",
      left: "50%",
      transform: "translate(-50%, -50%)",
      width: "600px",
      maxHeight: "80vh",
      overflowY: "auto",
      backgroundColor: "white",
      padding: "20px",
      boxShadow: "0 0 10px rgba(0,0,0,0.3)",
      zIndex: "10000",
    });

    const pre = document.createElement("pre");
    pre.textContent = JSON.stringify(data, null, 2);
    pre.style.whiteSpace = "pre-wrap";
    pre.style.wordWrap = "break-word";

    const copyBtn = document.createElement("button");
    copyBtn.textContent = "复制JSON";
    applyButtonLook(copyBtn, "#52c41a");
    copyBtn.addEventListener("click", () => {
      navigator.clipboard
        .writeText(JSON.stringify(data, null, 2))
        .then(() => this.showAutoCloseMessage("已复制到剪贴板", "success"))
        .catch((err) => alert("复制失败: " + err));
    });

    const closeBtn = document.createElement("button");
    closeBtn.textContent = "关闭";
    closeBtn.style.marginLeft = "10px";
    applyButtonLook(closeBtn, "#f5222d");
    closeBtn.addEventListener("click", () => {
      document.body.removeChild(modal);
    });

    modal.innerHTML = '<h2 style="margin-top: 0;">企业信息解析结果</h2>';
    for (const child of [pre, document.createElement("br"), copyBtn, closeBtn]) {
      modal.appendChild(child);
    }
    document.body.appendChild(modal);
  }
}
// 爱企查解析类
class AiQiChaParser {
getPhoneNumber() {
// 查找电话信息容器
const phoneContainer = document.querySelector(
"div.business-info div.telphone-lists-wrap"
);
if (!phoneContainer) return "未找到电话信息";
// 查找包含电话号码的元素
const phoneElement = phoneContainer.querySelector("span.copy-box span");
if (!phoneElement) return "未找到电话号码";
return ToolUtils.cleanText(phoneElement.textContent);
}
// Start without a table reference; initTable() assigns this.table later.
constructor() {
this.table = null;
}
// 初始化表格
initTable() {
this.table = document.querySelector("table.zx-detail-basic-table");
if (!this.table) {
alert("未找到企业信息表格");
return false;
}
return true;
}
// 获取优化后的值
getOptimizedValue(title) {
const cells = Array.from(this.table.querySelectorAll("td"));
const titleCell = cells.find(
(cell) => ToolUtils.cleanText(cell.textContent) === title
);
if (!titleCell) return null;
let valueCell = titleCell.nextElementSibling;
if (!valueCell) return null;
const valueElement =
valueCell.querySelector(".enter-bg-ele") ||
valueCell.querySelector(".addr-enter-bg-ele") ||
valueCell;
return ToolUtils.cleanText(valueElement.textContent);
}
// 获取法定代表人
getLegalRepresentative() {
const legalElements = Array.from(
this.table.querySelectorAll("td")
).filter((td) => ToolUtils.cleanText(td.textContent) === "法定代表人");
if (legalElements.length > 0) {
const valueCell = legalElements[0].nextElementSibling;
if (valueCell && valueCell.classList.contains("image-text-content")) {
const nameElement = valueCell.querySelector(".person-name-warp a");
if (nameElement) {
return ToolUtils.cleanText(nameElement.textContent);
}
return ToolUtils.cleanText(valueCell.textContent);
}
}
const titleElements = Array.from(
this.table.querySelectorAll("td")
).filter((td) => td.textContent.includes("法定代表人"));
if (titleElements.length > 0 && titleElements[0].nextElementSibling) {
const valueCell = titleElements[0].nextElementSibling;
return ToolUtils.cleanText(valueCell.textContent);
}
return null;
}
// 获取统一社会信用代码
getUnifiedSocialCreditCode() {
const codeElements = Array.from(this.table.querySelectorAll("td")).filter(
(td) => {
return (
td.textContent.includes("统一社会信用代码") &&
td.nextElementSibling &&
td.nextElementSibling.classList.contains("table-regCapital-lable")
);
}
);
if (codeElements.length > 0) {
const valueCell = codeElements[0].nextElementSibling;
const rawValue =
valueCell.querySelector(".enter-bg-ele")?.textContent ||
valueCell.textContent;
return ToolUtils.cleanText(rawValue);
}
const taxElements = Array.from(this.table.querySelectorAll("td")).filter(
(td) => td.textContent.includes("纳税人识别号")
);
if (taxElements.length > 0 && taxElements[0].nextElementSibling) {
const valueCell = taxElements[0].nextElementSibling;
const rawValue =
valueCell.querySelector(".enter-bg-ele")?.textContent ||
valueCell.textContent;
return ToolUtils.cleanText(rawValue);
}
return null;
}
// 获取工商注册号
getBusinessRegistrationNo() {
const regElements = Array.from(this.table.querySelectorAll("td")).filter(
(td) => ToolUtils.cleanText(td.textContent).includes("工商注册号")
);
if (regElements.length > 0 && regElements[0].nextElementSibling) {
const valueCell = regElements[0].nextElementSibling;
const rawValue =
valueCell.querySelector(".enter-bg-ele")?.textContent ||
valueCell.textContent;
return ToolUtils.cleanText(rawValue);
}
return null;
}
// 获取组织机构代码
getOrganizationCode() {
const orgCodeElements = Array.from(
this.table.querySelectorAll(".poptip-wrap-org-no")
).filter((el) => el.textContent.includes("组织机构代码"));
if (orgCodeElements.length > 0) {
const valueCell = orgCodeElements[0].closest("td").nextElementSibling;
if (valueCell && valueCell.classList.contains("enter-bg")) {
const rawValue =
valueCell.querySelector(".enter-bg-ele")?.textContent ||
valueCell.textContent;
return ToolUtils.cleanText(rawValue);
}
}
const titleElements = Array.from(
this.table.querySelectorAll("td")
).filter((td) => ToolUtils.cleanText(td.textContent) === "组织机构代码");
if (titleElements.length > 0 && titleElements[0].nextElementSibling) {
const valueCell = titleElements[0].nextElementSibling;
const rawValue =
valueCell.querySelector(".enter-bg-ele")?.textContent ||
valueCell.textContent;
return ToolUtils.cleanText(rawValue);
}
return null;
}
// 获取纳税人识别号
getTaxpayerId() {
const taxElements = Array.from(this.table.querySelectorAll("td")).filter(
(td) => ToolUtils.cleanText(td.textContent).includes("纳税人识别号")
);
if (taxElements.length > 0 && taxElements[0].nextElementSibling) {
const valueCell = taxElements[0].nextElementSibling;
const rawValue =
valueCell.querySelector(".enter-bg-ele")?.textContent ||
valueCell.textContent;
return ToolUtils.cleanText(rawValue);
}
const creditElements = Array.from(
this.table.querySelectorAll("td")
).filter((td) =>
ToolUtils.cleanText(td.textContent).includes("统一社会信用代码")
);
if (creditElements.length > 0 && creditElements[0].nextElementSibling) {
const valueCell = creditElements[0].nextElementSibling;
const rawValue =
valueCell.querySelector(".enter-bg-ele")?.textContent ||
valueCell.textContent;
return ToolUtils.cleanText(rawValue);
}
return null;
}
// 获取参保人数
getInsuranceNumber() {
const insuranceElements = Array.from(
this.table.querySelectorAll("td")
).filter((td) => {
return (
td.textContent.includes("参保人数") &&
td.querySelector(".insurance-info")
);
});
if (insuranceElements.length > 0) {
const valueCell = insuranceElements[0].nextElementSibling;
if (!valueCell) return null;
const rawText = valueCell.textContent.replace(/[\r\n\t]/g, "").trim();
const match = rawText.match(/(\d+人)/);
return match ? match[0] : null;
}
const registrationElements = Array.from(
this.table.querySelectorAll("td")
).filter((td) => td.textContent.includes("登记机关"));
if (
registrationElements.length > 0 &&
registrationElements[0].previousElementSibling
) {
const valueCell = registrationElements[0].previousElementSibling;
const rawText = valueCell.textContent.replace(/[\r\n\t]/g, "").trim();
const match = rawText.match(/(\d+人)/);
return match ? match[0] : null;
}
return null;
}
// 获取核准日期
getApprovalDate() {
const approvalElements = Array.from(
this.table.querySelectorAll(".poptip-wrap-annual-date")
).filter((el) => el.textContent.includes("核准日期"));
if (approvalElements.length > 0) {
const valueCell = approvalElements[0].closest("td").nextElementSibling;
if (valueCell) {
const rawValue = valueCell.textContent
.replace(/[\r\n\t]/g, "")
.trim();
if (/^\d{4}-\d{2}-\d{2}$/.test(rawValue)) {
return rawValue;
}
}
}
const titleElements = Array.from(
this.table.querySelectorAll("td")
).filter((td) => ToolUtils.cleanText(td.textContent) === "核准日期");
if (titleElements.length > 0 && titleElements[0].nextElementSibling) {
const valueCell = titleElements[0].nextElementSibling;
const rawValue = ToolUtils.cleanText(valueCell.textContent);
if (/^\d{4}-\d{2}-\d{2}$/.test(rawValue)) {
return rawValue;
}
}
return null;
}
// 解析公司信息主方法
parseCompanyInfo() {
if (!this.initTable()) return;
const companyData = {
企业名称: this.getOptimizedValue("企业名称"),
统一社会信用代码: this.getUnifiedSocialCreditCode(),
法定代表人: this.getLegalRepresentative(),
电话: this.getPhoneNumber(),
经营状态: this.getOptimizedValue("经营状态"),
成立日期: this.getOptimizedValue("成立日期"),
行政区划: this.getOptimizedValue("行政区划"),
注册资本: this.getOptimizedValue("注册资本"),
实缴资本: this.getOptimizedValue("实缴资本"),
企业类型: this.getOptimizedValue("企业类型"),
所属行业: this.getOptimizedValue("所属行业"),
工商注册号: this.getBusinessRegistrationNo(),
组织机构代码: this.getOrganizationCode(),
纳税人识别号: this.getTaxpayerId(),
纳税人资质: this.getOptimizedValue("纳税人资质"),
营业期限: this.getOptimizedValue("营业期限"),
核准日期: this.getApprovalDate(),
参保人数: this.getInsuranceNumber(),
登记机关: this.getOptimizedValue("登记机关"),
曾用名: this.getOptimizedValue("曾用名"),
注册地址: this.getOptimizedValue("注册地址"),
经营范围: this.getOptimizedValue("经营范围"),
};
ToolUtils.showResult(companyData);
}
}
// QCC解析类 企查查
/**
 * Parser for company detail pages on qcc.com (企查查).
 *
 * Values are read from the `table.ntable` inside `div.cominfo-normal`, where
 * each `td.tb` header cell is followed by its value cell. The phone number is
 * read from the separate `div.contact-info` area.
 */
class QCCParser {
  constructor() {
    // Assigned by initTable(); all table getters read from it.
    this.table = null;
  }

  /**
   * Locate the company-info table and store it on `this.table`.
   * Alerts the user and returns false when the container or table is missing.
   * @returns {boolean} true when the table was found.
   */
  initTable() {
    const cominfoNormal = document.querySelector("div.cominfo-normal");
    if (!cominfoNormal) {
      alert("未找到企业信息容器");
      return false;
    }
    this.table = cominfoNormal.querySelector("table.ntable");
    if (!this.table) {
      alert("未找到企业信息表格");
      return false;
    }
    return true;
  }

  /**
   * Strip a trailing "关联企业 <n>" (affiliated companies counter) from a name.
   * @param {string} text
   * @returns {string}
   */
  _stripAffiliateSuffix(text) {
    return text.replace(/\s*关联企业\s*\d+$/, "").trim();
  }

  /**
   * Look up the value cell that follows the header cell labelled `title`.
   *
   * Headers whose cleaned text equals `title` are tried first, then headers
   * that merely contain it. The first candidate with a non-empty value wins,
   * so an unrelated header that happens to contain the label can no longer
   * overwrite an exact match (the previous forEach kept the LAST match).
   * Inside the value cell a `.copy-value` element is preferred over the
   * whole cell text.
   *
   * @param {string} title header label to search for.
   * @returns {string|null} the cleaned value; the first (possibly empty)
   *   value found when no candidate is non-empty; null when no header with a
   *   value cell matches.
   */
  getOptimizedValue(title) {
    const headers = Array.from(this.table.querySelectorAll("td.tb")).map(
      (cell) => ({ cell, label: ToolUtils.cleanText(cell.textContent) })
    );
    const candidates = [
      ...headers.filter(({ label }) => label === title),
      ...headers.filter(
        ({ label }) => label !== title && label.includes(title)
      ),
    ];

    let firstSeen = null;
    for (const { cell } of candidates) {
      const valueCell = cell.nextElementSibling;
      if (!valueCell) continue;
      const copyValue = valueCell.querySelector(".copy-value");
      const value = ToolUtils.cleanText((copyValue || valueCell).textContent);
      if (value) return value;
      if (firstSeen === null) firstSeen = value;
    }
    return firstSeen;
  }

  /**
   * Read the legal representative's name.
   *
   * Tries the generic label lookup first ("法定代表人", then "法人"). When that
   * yields nothing, inspects the value cell directly: links opened in a new
   * tab, then the first link, then `.copy-value`, then the raw cell text.
   * A trailing "关联企业 <n>" counter is removed from the result.
   *
   * @returns {string|null}
   */
  getLegalRepresentative() {
    const basicValue =
      this.getOptimizedValue("法定代表人") || this.getOptimizedValue("法人");
    if (basicValue && basicValue.trim()) {
      return this._stripAffiliateSuffix(basicValue);
    }

    const headerCell = Array.from(this.table.querySelectorAll("td.tb")).find(
      (cell) => ToolUtils.cleanText(cell.textContent).includes("法定代表人")
    );
    if (!headerCell) return null;
    const valueCell = headerCell.nextElementSibling;
    if (!valueCell) return null;

    // Links with target="_blank": skip ones that are clearly not a name
    // (the affiliated-companies link and the copy button).
    const nameLinks = valueCell.querySelectorAll('a[target="_blank"]');
    for (const link of nameLinks) {
      const name = ToolUtils.cleanText(link.textContent);
      if (name && !name.includes("关联企业") && !name.includes("复制")) {
        return name;
      }
    }

    const firstLink = valueCell.querySelector("a");
    if (firstLink) {
      return this._stripAffiliateSuffix(
        ToolUtils.cleanText(firstLink.textContent)
      );
    }

    const copyValue = valueCell.querySelector(".copy-value");
    if (copyValue) {
      return this._stripAffiliateSuffix(
        ToolUtils.cleanText(copyValue.textContent)
      );
    }

    return this._stripAffiliateSuffix(
      ToolUtils.cleanText(valueCell.textContent)
    );
  }

  /** @returns {string|null} value under "统一社会信用代码", else under "信用代码". */
  getUnifiedSocialCreditCode() {
    return (
      this.getOptimizedValue("统一社会信用代码") ||
      this.getOptimizedValue("信用代码")
    );
  }

  /** @returns {string|null} value under "工商注册号", else under "注册号". */
  getBusinessRegistrationNo() {
    return (
      this.getOptimizedValue("工商注册号") || this.getOptimizedValue("注册号")
    );
  }

  /** @returns {string|null} value under "组织机构代码". */
  getOrganizationCode() {
    return this.getOptimizedValue("组织机构代码");
  }

  /**
   * @returns {string|null} value under "纳税人识别号", falling back to the
   *   unified social credit code when that field is absent or empty.
   */
  getTaxpayerId() {
    return (
      this.getOptimizedValue("纳税人识别号") ||
      this.getUnifiedSocialCreditCode()
    );
  }

  /**
   * Read the insured-employee count (header containing "参保人数").
   *
   * The number is taken from the first <span> of the value cell and the text
   * of an `a.m-l-r-10` link (the annual-report year, per the original
   * comment) is appended when present.
   *
   * @returns {string|null} number plus optional report text, or null when
   *   the header, value cell or number is missing.
   */
  getInsuranceNumber() {
    const headerCell = Array.from(this.table.querySelectorAll("td.tb")).find(
      (cell) => ToolUtils.cleanText(cell.textContent).includes("参保人数")
    );
    if (!headerCell) return null;
    const valueCell = headerCell.nextElementSibling;
    if (!valueCell) return null;

    const numberSpan = valueCell.querySelector("span");
    const number = numberSpan
      ? ToolUtils.cleanText(numberSpan.textContent)
      : null;

    const reportLink = valueCell.querySelector("a.m-l-r-10");
    const reportYear = reportLink
      ? ToolUtils.cleanText(reportLink.textContent)
      : "";

    return number ? `${number}${reportYear}` : null;
  }

  /**
   * Read the contact phone number from the page header area
   * (`div.contact-info` > `div.main-part-item.right` > `div.rline`).
   *
   * @returns {string|null} text of the first `span.need-copy-field` in the
   *   phone row that is not the "电话:" label itself, or null.
   */
  getPhoneNumber() {
    const contactInfo = document.querySelector("div.contact-info");
    if (!contactInfo) return null;

    const rightPart = contactInfo.querySelector("div.main-part-item.right");
    if (!rightPart) return null;

    const rows = Array.from(rightPart.querySelectorAll("div.rline"));
    const phoneRow = rows.find((row) =>
      ToolUtils.cleanText(row.textContent).includes("电话:")
    );
    if (!phoneRow) return null;

    const spans = Array.from(
      phoneRow.querySelectorAll("span.need-copy-field")
    );
    const phoneSpan = spans.find(
      (span) => !ToolUtils.cleanText(span.textContent).includes("电话:")
    );
    return phoneSpan ? ToolUtils.cleanText(phoneSpan.textContent) : null;
  }

  /**
   * @returns {string|null} value under "核准日期"; when absent or empty the
   *   establishment date ("成立日期") is returned instead.
   */
  getApprovalDate() {
    return (
      this.getOptimizedValue("核准日期") || this.getOptimizedValue("成立日期")
    );
  }

  /**
   * Entry point: collect every company field and pass the record to
   * `ToolUtils.showResult`. Does nothing when `initTable()` fails.
   * Output keys are fixed Chinese labels; some map to differently named
   * headers on this site (e.g. 经营状态 <- "登记状态", 所属行业 <- "国标行业").
   */
  parseCompanyInfo() {
    if (!this.initTable()) return;
    const companyData = {
      企业名称:
        this.getOptimizedValue("企业名称") ||
        this.getOptimizedValue("公司名称"),
      统一社会信用代码: this.getUnifiedSocialCreditCode(),
      法定代表人: this.getLegalRepresentative(),
      经营状态: this.getOptimizedValue("登记状态"),
      成立日期: this.getOptimizedValue("成立日期"),
      行政区划: this.getOptimizedValue("行政区划"),
      注册资本: this.getOptimizedValue("注册资本"),
      实缴资本: this.getOptimizedValue("实缴资本"),
      企业类型: this.getOptimizedValue("企业类型"),
      所属行业: this.getOptimizedValue("国标行业"),
      工商注册号: this.getBusinessRegistrationNo(),
      组织机构代码: this.getOrganizationCode(),
      纳税人识别号: this.getTaxpayerId(),
      纳税人资质: this.getOptimizedValue("纳税人资质"),
      营业期限: this.getOptimizedValue("营业期限"),
      核准日期: this.getApprovalDate(),
      参保人数: this.getInsuranceNumber(),
      电话: this.getPhoneNumber(),
      登记机关: this.getOptimizedValue("登记机关"),
      曾用名: this.getOptimizedValue("曾用名"),
      注册地址: this.getOptimizedValue("注册地址"),
      经营范围: this.getOptimizedValue("经营范围"),
    };
    ToolUtils.showResult(companyData);
  }
}
// 创建按钮容器
/**
 * Build the floating tool widget and append it to <body>.
 *
 * The widget is a 40px green circle showing "+" in the bottom-right corner.
 * Hovering expands it to reveal two buttons:
 *  - "复制源码": copies the page's outer HTML via `copyToClipboard`;
 *  - "解析公司信息": runs the parser matching the current host
 *    (AiQiChaParser for aiqicha.baidu.com, QCCParser for qcc.com).
 * The widget can be dragged by its "+" sign or its own background.
 */
function createButtonContainer() {
  const container = document.createElement("div");
  container.id = "tool-container";
  Object.assign(container.style, {
    position: "fixed",
    right: "20px",
    bottom: "20px",
    zIndex: "9999",
    display: "flex",
    flexDirection: "column",
    gap: "10px",
    width: "40px",
    height: "40px",
    backgroundColor: "#4CAF50",
    borderRadius: "50%",
    transition: "all 0.3s ease",
    overflow: "hidden",
    cursor: "move",
  });

  // "+" indicator shown while the widget is collapsed.
  const plusSign = document.createElement("div");
  plusSign.textContent = "+";
  Object.assign(plusSign.style, {
    color: "white",
    fontSize: "24px",
    textAlign: "center",
    lineHeight: "40px",
    width: "100%",
  });
  container.appendChild(plusSign);

  // Expand on hover, collapse back to the circle on leave.
  container.addEventListener("mouseenter", () => {
    container.style.width = "150px";
    container.style.height = "auto";
    container.style.borderRadius = "8px";
  });
  container.addEventListener("mouseleave", () => {
    container.style.width = "40px";
    container.style.height = "40px";
    container.style.borderRadius = "50%";
  });

  // Drag support.
  let isDragging = false;
  let offsetX, offsetY;

  container.addEventListener("mousedown", (e) => {
    // Only the "+" sign or the container background start a drag, so
    // pressing a button still produces a normal click.
    if (e.target === plusSign || e.target === container) {
      isDragging = true;
      const rect = container.getBoundingClientRect();
      offsetX = e.clientX - rect.left;
      offsetY = e.clientY - rect.top;
      container.style.cursor = "grabbing";
      e.stopPropagation();
      e.preventDefault();
    }
  });

  // Listen on document so the drag keeps tracking when the pointer leaves
  // the widget.
  document.addEventListener("mousemove", (e) => {
    if (!isDragging) return;
    container.style.left = e.clientX - offsetX + "px";
    container.style.top = e.clientY - offsetY + "px";
    // Switch from right/bottom anchoring to the explicit left/top position.
    container.style.right = "auto";
    container.style.bottom = "auto";
  });

  document.addEventListener("mouseup", () => {
    if (isDragging) {
      isDragging = false;
      container.style.cursor = "move";
    }
  });

  /**
   * Create one full-width white button with a hover highlight.
   * @param {string} text button caption.
   * @param {Function} onClick click handler.
   * @returns {HTMLButtonElement}
   */
  function createButton(text, onClick) {
    const button = document.createElement("button");
    button.textContent = text;
    Object.assign(button.style, {
      padding: "8px 12px",
      border: "none",
      borderRadius: "4px",
      backgroundColor: "white",
      color: "#333",
      cursor: "pointer",
      width: "100%",
      // The transition value is CSS text, so the property must be spelled
      // "background-color"; the camelCase form is ignored by the browser.
      transition: "background-color 0.2s",
    });
    button.addEventListener(
      "mouseenter",
      () => (button.style.backgroundColor = "#f0f0f0")
    );
    button.addEventListener(
      "mouseleave",
      () => (button.style.backgroundColor = "white")
    );
    button.addEventListener("click", onClick);
    return button;
  }

  // "Copy source" button.
  const copySourceButton = createButton("复制源码", () => {
    const html = document.documentElement.outerHTML;
    copyToClipboard(html, "HTML源码已复制到剪贴板");
  });

  // "Parse company info" button: choose the parser from the current host.
  const parseInfoButton = createButton("解析公司信息", () => {
    let parser;
    if (window.location.host.includes("aiqicha.baidu.com")) {
      parser = new AiQiChaParser();
    } else if (window.location.host.includes("qcc.com")) {
      parser = new QCCParser();
    } else {
      alert("不支持的网站");
      return;
    }
    parser.parseCompanyInfo();
  });

  container.appendChild(copySourceButton);
  container.appendChild(parseInfoButton);
  document.body.appendChild(container);
}
// Create the widget once the page has finished loading. If the script is
// injected after the "load" event has already fired, a listener would never
// run, so build the widget right away in that case.
if (document.readyState === "complete") {
  createButtonContainer();
} else {
  window.addEventListener("load", createButtonContainer);
}
})();

View File

@@ -0,0 +1,24 @@
from paddleocr import PaddleOCR
# Pipeline options: Chinese recognition with the three optional
# pre-processing models switched off (document orientation classification,
# text-image unwarping, text-line orientation classification).
# Alternatives noted by the original author: lang="en" for the English model,
# ocr_version="PP-OCRv4" for another PP-OCR version, device="gpu" for GPU
# inference, or text_detection_model_name="PP-OCRv5_server_det" together with
# text_recognition_model_name="PP-OCRv5_server_rec" for the server models.
pipeline_options = {
    "use_doc_orientation_classify": False,
    "use_doc_unwarping": False,
    "use_textline_orientation": False,
    "lang": "ch",
}
ocr = PaddleOCR(**pipeline_options)

# Run recognition on one sample page image (hard-coded local path).
result = ocr.predict("D:/gitstudy/pythonwork/manchuspider/data/满洲语字典/满汉大辞典/images/1.png")

# For every returned result: print it, then save image and JSON output under
# "output".
for page_result in result:
    page_result.print()
    page_result.save_to_img("output")
    page_result.save_to_json("output")

View File

@@ -7,7 +7,7 @@ from tool.aiqicha_login import AiqichaLoginManager # 导入登录管理器
from tool.aiqicha_detail_parser import AiqichaDetailParser # 导入解析器
class AiqichaDetailCrawler:
def __init__(self, cookie_path="aiqicha_cookies.json"):
def __init__(self, cookie_path="cookies/aiqicha_cookies.json"):
self.browser = WebBrowser(cookie_path)
self.browser_started = False
self.login_manager = None # 添加登录管理器实例
@@ -27,12 +27,12 @@ class AiqichaDetailCrawler:
else:
print("已加载Cookie")
# 使用登录管理器检测登录状态
logined = self.login_manager.check_and_login()
if logined:
print("登录成功")
else:
print("登录失败")
# # 使用登录管理器检测登录状态
# logined = self.login_manager.check_and_login()
# if logined:
# print("登录成功")
# else:
# print("登录失败")
self.browser_started = True
except Exception as e:
print(f"启动浏览器失败: {e}")
@@ -50,6 +50,47 @@ class AiqichaDetailCrawler:
finally:
self.browser_started = False
def close_svip_popups(self, selectors=("div.dialog-close",), pause_seconds=1):
    """Click every visible pop-up close button on the current page.

    The previous version duplicated the loop for a second selector
    (``a.ivu-modal-close``) behind an ``if True: return`` guard, so that half
    never ran. The selector list is now a parameter; the default reproduces
    what actually executed. Pass
    ``selectors=("div.dialog-close", "a.ivu-modal-close")`` to also close the
    modal variant.

    Args:
        selectors: CSS selectors of close buttons, processed in order.
        pause_seconds: Seconds to wait after each click so the dialog can
            disappear before the next button is handled.

    Returns:
        None. Errors are reported with ``print`` and never raised, because
        closing pop-ups is best-effort.
    """
    for selector in selectors:
        try:
            close_buttons = self.browser.page.query_selector_all(selector)
            print(f'找到 {len(close_buttons)} 个对话框关闭按钮')

            for position, button in enumerate(close_buttons, start=1):
                # Hidden buttons belong to dialogs that are not showing.
                if button and button.is_visible():
                    print(f"点击第 {position} 个可见的对话框关闭按钮")
                    button.click()
                    time.sleep(pause_seconds)

            print("已完成关闭可见的对话框")
        except Exception as e:
            print(f"关闭对话框时出错: {e}")
def crawl_company_detail(self, url: str, refer_url: str = None):
"""
爬取爱企查企业详情页数据
@@ -76,24 +117,63 @@ class AiqichaDetailCrawler:
# 访问页面
if self.browser.visit_page(url):
# 增强页面加载检查
print("等待页面关键元素加载...")
self.close_svip_popups()
# 保存当前页面的html 到 demo/html/aiqicha-datail.html
# 目录不存在时创建
try:
self.browser.page.wait_for_selector('.header-user-center', timeout=10000)
print(f".header-user-center1: 等待页面元素ok")
except Exception as e:
print(f".header-user-center1: 等待页面元素时出错: {e}")
self.browser.save_page_html("demo/html/aiqicha-datail.html")
try:
# 增强页面加载检查
# 关闭指定的弹窗 (aiqicha-datail.html 792-793 行对应的元素)
self.close_svip_popups()
self.browser.save_page_html("demo/html/aiqicha-datail-after-close_svip_popups.html")
# 使用登录管理器检测登录状态
login = self.login_manager.check_and_login()
if login:
print("crawl_company_detail登录成功")
else:
print("crawl_company_detail登录失败")
self.browser.save_page_html("demo/html/aiqicha-datail-afterchecklogin.html")
# 关闭指定的弹窗 (aiqicha-datail.html 792-793 行对应的元素)
self.close_svip_popups()
self.browser.save_page_html("demo/html/aiqicha-datail-after-close_svip_popups2.html")
# 等待关键元素加载,增加超时时间
self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=15000)
print("等待页面关键元素加载...")
try:
self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=10000)
print("crawl_company_detail.addr-enter-bg-ele success")
except Exception as e:
print(f"等待页面元素时出错: {e}")
# self.browser.page.wait_for_selector('.addr-enter-bg-ele', timeout=10000)
print("关键元素已加载")
# 额外等待一段时间确保页面完全加载
import time
time.sleep(2)
time.sleep(3)
print("额外等待完成,页面应该已完全加载")
except Exception as e:
print(f"等待页面元素时出错: {e}")
print("继续尝试解析页面内容...")
self.browser.save_cookies()
# 提取基本信息
print("开始解析页面信息...")
parser = AiqichaDetailParser(self.browser.page)
parser = AiqichaDetailParser(self.browser)
company_info = parser.parse_company_info()
print(f"成功爬取企业信息: {company_info['name']}")
@@ -101,7 +181,6 @@ class AiqichaDetailCrawler:
else:
print("访问页面失败")
return {}
except Exception as e:
print(f"爬取过程中出现错误: {e}")
return {}

View File

@@ -6,14 +6,15 @@ import re
class AiqichaDetailParser:
"""爱企查企业详情页解析器"""
def __init__(self, page):
def __init__(self, browser):
"""
初始化解析器
Args:
page: 浏览器页面对象
browser: 浏览器页面对象
"""
self.page = page
self.browser = browser
# self.self.browser
def parse_company_info(self):
"""
@@ -48,62 +49,111 @@ class AiqichaDetailParser:
}
# 批量提取信息
for field, selectors in fields.items():
company_info[field] = self._extract_field_value(selectors)
# 爱企查页面使用表格结构,需要特殊处理
field_mapping = {
'legal_representative': '法定代表人',
'business_scope': '经营范围',
'credit_code': '统一社会信用代码',
'registered_capital': '注册资本',
'establishment_date': '成立日期',
'business_status': '经营状态',
'company_type': '企业类型',
'registration_authority': '登记机关',
'operating_period': '营业期限',
'address': '注册地址',
'administrative_division': '行政区划',
'business_registration_number': '工商注册号',
'taxpayer_qualification': '纳税人资质',
'approved_date': '核准日期',
'staff_size': '参保人数',
'former_name': '曾用名'
}
for field_name, field_text in field_mapping.items():
company_info[field_name] = self._extract_field_value(field_text)
# 特殊处理电话号码
company_info['phone'] = self._extract_phone_number()
company_info['name'] = self._extract_company_name()
return company_info
def _extract_field_value(self, selectors):
def _extract_company_name(self):
"""
根据多个选择器提取字段值
Args:
selectors (list): CSS选择器列表
提取企业名称
Returns:
str: 提取到的值"未知"
str: 企业名称"未知"
"""
for selector in selectors:
try:
# 添加日志:显示当前尝试的选择器
print(f"尝试选择器: {selector}")
try:
# 尝试多种方式获取企业名称
selectors = [
'title', # 页面标题
'.company-name', # 常见的公司名称类
'h1.enterprise-name', # 企业名称标题
'.company-title' # 其他可能的类名
]
# 尝试查找带有 enter-bg-ele 类的元素
element = self.page.query_selector(f"{selector} .enter-bg-ele")
if element:
print(f"找到 enter-bg-ele 元素,选择器: {selector} .enter-bg-ele")
else:
# 尝试查找带有 addr-enter-bg-ele 类的元素
element = self.page.query_selector(f"{selector} .addr-enter-bg-ele")
for selector in selectors:
try:
element = self.browser.page.query_selector(selector)
if element:
print(f"找到 addr-enter-bg-ele 元素,选择器: {selector} .addr-enter-bg-ele")
else:
# 直接查找元素
element = self.page.query_selector(selector)
if element:
print(f"找到直接元素,选择器: {selector}")
text = element.inner_text().strip()
# 如果是标题,可能需要去除后缀
if selector == 'title' and '-' in text:
text = text.split('-')[0].strip()
text = self._clean_text(text)
if text and text != "未知":
return text
except:
continue
return "未知"
except Exception as e:
print(f"提取企业名称时出错: {e}")
return "未知"
def _extract_field_value(self, field_text):
"""
根据多个选择器提取字段值,适配爱企查实际页面结构
"""
# for field_name, field_text in field_mapping.items():
try:
# 查找包含特定文本的td元素
title_element = self.browser.page.query_selector(f'td:has-text("{field_text}")')
if title_element:
# 获取相邻的td元素包含实际值
value_element = title_element.evaluate_handle('el => el.nextElementSibling')
if value_element:
text = value_element.inner_text().strip()
# 清理文本,移除前缀
if "" in text:
text = text.split("", 1)[1].strip()
# 特殊处理法定代表人字段,去除"TA有X家企业"等额外信息
if field_text == "法定代表人":
# 移除类似"TA有12家企业"的额外信息
text = re.sub(r'\s*TA有\d+家企业.*$', '', text)
# 特殊处理地址字段,去除"查看地图"等额外信息
if field_text == "注册地址":
# 移除"查看地图"等额外信息
text = re.sub(r'\s*查看地图.*$', '', text)
text = re.sub(r'\s*附近企业.*$', '', text)
# 特殊处理参保人数字段,仅保留数字
if field_text == "参保人数":
# 提取数字部分,如"7人" -> "7"
match = re.search(r'(\d+)', text)
if match:
text = match.group(1)
if element:
text = element.inner_text().strip()
print(f"提取到原始文本: '{text}'")
# 清理文本内容
text = self._clean_text(text)
print(f"清理后文本: '{text}'")
if text:
print(f"返回文本: '{text}'")
return text
else:
print("文本为空或仅包含空白字符")
else:
print(f"未找到元素,选择器: {selector}")
except Exception as e:
print(f"提取字段时出错,选择器: {selector}, 错误: {e}")
continue
except Exception as e:
print(f"提取字段 {field_text} 时出错: {e}")
# continue
print("所有选择器都未找到有效元素,返回默认值")
return "未知"
def _clean_text(self, text):
@@ -131,7 +181,7 @@ class AiqichaDetailParser:
"""
try:
# 查找电话信息容器
phone_container = self.page.query_selector("div.business-info div.telphone-lists-wrap")
phone_container = self.browser.page.query_selector("div.business-info div.telphone-lists-wrap")
if phone_container:
# 查找包含电话号码的元素
phone_element = phone_container.query_selector("span.copy-box span")

View File

@@ -16,17 +16,44 @@ class AiqichaLoginManager:
else:
print("已加载Cookie验证登录状态...")
# 加载cookie后访问页面验证是否真正登录
self.browser.page.goto("https://aiqicha.baidu.com")
# 等待页面加载完成
self.browser.page.wait_for_load_state("networkidle")
# 判断当前的url地址 如果是 https://aiqicha.baidu.com/company_detail_* 地址
if "company_detail" in self.browser.page.url:
# 等待页面加载完成
# .header-user-center-menu
try:
# 检测用户中心元素判断已登录
self.browser.page.wait_for_selector('.header-user-center-menu', timeout=30000)
self.browser.save_cookies()
print("header-user-center-menu 检测到已登录状态")
return True
except Exception as e:
print("header-user-center-menu 登录状态检测异常", e)
# return False
self.browser.page.wait_for_load_state("networkidle")
# 验证登录状态
if not self.check_login_status():
print("Cookie已过期或无效重新登录...")
return self.login()
else:
print("Cookie有效已登录")
return True
# 验证登录状态
if not self.check_login_status():
print("Cookie已过期或无效重新登录...")
return self.login()
else:
print("Cookie有效已登录")
return True
print("未登录或已过期,开始登录流程...")
self.browser.page.goto("https://aiqicha.baidu.com")
# 等待页面加载完成
self.browser.page.wait_for_load_state("networkidle")
# 验证登录状态
if not self.check_login_status():
print("Cookie已过期或无效重新登录...")
return self.login()
else:
print("Cookie有效已登录")
return True
def check_login_status(self):
"""检测登录状态返回True表示已登录"""

View File

@@ -6,7 +6,7 @@ from tool.web_browser import WebBrowser
class BingSearcher:
def __init__(self, cookie_path="bing_cookies.json"):
def __init__(self, cookie_path="cookies/bing_cookies.json"):
self.browser = WebBrowser(cookie_path)
self.browser_started = False

View File

@@ -25,6 +25,40 @@ class CSVTool:
writer = csv.writer(f)
writer.writerow(self.headers)
# 文件存在,检查是否为空或只有空行
try:
with open(self.csv_file_name, 'r', encoding='utf-8') as f:
content = f.read()
# 如果文件内容为空或只包含空白字符(空格、换行符等)
if not content.strip():
with open(self.csv_file_name, 'w', encoding='utf-8', newline='') as f:
writer = csv.writer(f)
writer.writerow(self.headers)
return
# 文件有内容,检查第一行是否为表头
with open(self.csv_file_name, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
first_row = next(reader, None)
# 如果第一行不是预期的表头,则重新写入表头和原有内容
if first_row != self.headers:
lines = content.strip().split('\n')
# 过滤掉空行和只包含空白字符的行
non_empty_lines = [line for line in lines if line.strip()]
with open(self.csv_file_name, 'w', encoding='utf-8', newline='') as f:
writer = csv.writer(f)
writer.writerow(self.headers)
# 如果有非空内容,则写入
if non_empty_lines:
f.write('\n'.join(non_empty_lines) + '\n')
except Exception as e:
print(f"检查/更新表头时出错: {e}")
def get_existing_data(self, unique_titles: List[str]) -> set:
"""
读取现有数据,用于去重检查

View File

@@ -5,9 +5,18 @@ import json
import os
import time
import urllib.parse
from playwright.async_api import async_playwright
from playwright.sync_api import sync_playwright
def create_directory(directory_path):
    """Create ``directory_path`` (including parents) if it does not exist.

    An empty path is a no-op: ``os.path.dirname("file.html")`` returns ``""``
    and ``os.makedirs("")`` would raise. ``exist_ok=True`` closes the window
    between the existence check and the creation.

    Args:
        directory_path: Directory to create; may be empty.
    """
    if not directory_path or os.path.exists(directory_path):
        return
    os.makedirs(directory_path, exist_ok=True)
    print(f"已创建目录: {directory_path}")
class WebBrowser:
def __init__(self, cookie_path="browser_cookies.json"):
self.cookie_path = cookie_path
@@ -16,6 +25,58 @@ class WebBrowser:
self.page = None
self.playwright = None
def human_like_actions(self):
    """Pause briefly, then hover one of the first links/buttons on the page.

    Sleeps 1-3 seconds, picks a random element among the first five matches
    of ``"a, button"``, moves the mouse to (0, 0), waits, hovers the element
    and waits again. Hover failures are reported and ignored because this
    step is cosmetic.
    """
    time.sleep(random.uniform(1, 3))

    try:
        hover_elements = self.page.query_selector_all("a, button")
        if hover_elements:
            element = random.choice(hover_elements[:5])
            self.page.mouse.move(0, 0)  # move away first
            time.sleep(0.5)
            element.hover()
            time.sleep(random.uniform(0.5, 1.5))
    except Exception as e:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit raised while sleeping.
        print(f"模拟鼠标悬停时出错: {e}")
def get_random_user_agent(self):
    """Return one desktop Chrome User-Agent string chosen at random.

    The pool holds three entries: Chrome 122 and 121 on Windows 10 x64 and
    Chrome 122 on macOS 10.15.7.
    """
    pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )
    chosen = random.choice(pool)
    return chosen
def enhanced_anti_detection(self):
    """Register an init script that hides common automation fingerprints.

    The injected JavaScript
      * makes ``navigator.webdriver`` read as ``undefined``,
      * defines a minimal ``window.chrome`` object when it is missing,
      * answers ``permissions.query({name: 'notifications'})`` with the
        current ``Notification.permission`` and forwards every other query.

    Fixes over the previous script:
      * the original ``permissions.query`` is bound to its owner before being
        forwarded to; calling the detached native method throws
        "Illegal invocation";
      * the patch is skipped when ``navigator.permissions`` is unavailable
        instead of raising;
      * the code runs inside an IIFE so its ``const`` names cannot clash with
        names declared by other injected scripts.
    """
    self.page.add_init_script("""
    (() => {
        // 更彻底地隐藏webdriver痕迹
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });

        // 伪装chrome csi和loadTimes
        if (!window.chrome) {
            window.chrome = {
                runtime: {}
            };
        }

        // 伪装permissions查询
        const permissions = window.navigator.permissions;
        if (permissions && typeof permissions.query === 'function') {
            const originalQuery = permissions.query.bind(permissions);
            permissions.query = (parameters) => {
                if (parameters && parameters.name === 'notifications') {
                    return Promise.resolve({
                        state: Notification.permission
                    });
                }
                return originalQuery(parameters);
            };
        }
    })();
    """)
def anti_detection(self):
"""注入更全面的反检测脚本"""
self.page.add_init_script("""
@@ -122,6 +183,27 @@ class WebBrowser:
except:
return False
def start_browser_with_proxy(self, proxy_host=None, proxy_port=None):
    """Launch a visible Chromium instance, optionally through an HTTP proxy.

    Calls ``self.init_cookie_file()``, starts Playwright and stores the
    Playwright handle and the launched browser on ``self.playwright`` and
    ``self.browser``. The proxy flag is added only when both ``proxy_host``
    and ``proxy_port`` are truthy.

    NOTE(review): the original body ends with a placeholder comment for "the
    rest of the initialization"; no context or page is created here —
    confirm whether callers are expected to do that.
    """
    self.init_cookie_file()

    proxy_args = (
        [f"--proxy-server=http://{proxy_host}:{proxy_port}"]
        if proxy_host and proxy_port
        else []
    )
    launch_args = [
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ] + proxy_args

    self.playwright = sync_playwright().start()
    self.browser = self.playwright.chromium.launch(
        args=launch_args,
        headless=False,
    )
    # ... remaining initialization (placeholder kept from the original)
def bypass_debugger(self):
"""绕过调试器检测"""
self.page.add_init_script("""
@@ -188,10 +270,12 @@ class WebBrowser:
)
self.page = self.context.new_page()
self.anti_detection()
self.enhanced_anti_detection()
# 立即执行一次反检测
self.page.evaluate("""
delete navigator.__proto__.webdriver;
""")
self.human_like_actions()
self.random_behavior()
def close_browser(self):
@@ -203,6 +287,20 @@ class WebBrowser:
if self.playwright:
self.playwright.stop()
def save_page_html(self, filepath):
    """Write the current page's HTML to ``filepath`` (UTF-8).

    The parent directory is created first when the path has one. A bare
    filename has an empty parent, which used to make the directory step fail
    so nothing was written; that step is now skipped in this case.

    Args:
        filepath: Destination file path.

    Returns:
        None. Failures are reported with ``print`` and not raised.
    """
    try:
        html_content = self.page.content()

        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            create_directory(parent_dir)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"页面HTML已保存到: {filepath}")
    except Exception as e:
        print(f"保存页面HTML失败: {e}")
def visit_page(self, url):
"""访问指定页面"""
try:
@@ -243,7 +341,17 @@ class WebBrowser:
self.page.evaluate("""
delete navigator.__proto__.webdriver;
""")
self.page.wait_for_load_state("networkidle")
# self.page.wait_for_load_state("networkidle")
# 3. 等待页面加载状态而不是特定元素
try:
self.page.wait_for_load_state('networkidle', timeout=5000)
print("networkidle, timeout=5000页面已加载")
except Exception as e:
print(f"等待页面加载状态时出错: {e}")
# self.page.wait_for_load_state('networkidle', timeout=5000)
self.human_like_actions()
self.random_behavior()
return True
except Exception as e:

Some files were not shown because too many files have changed in this diff Show More