From dd368bb64502196ff5475219f66deb9a53e62c67 Mon Sep 17 00:00:00 2001 From: manchuwork Date: Fri, 3 Oct 2025 07:45:21 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9aiqicha=20login=E9=80=BB?= =?UTF-8?q?=E8=BE=91=EF=BC=8C=20cookie=E6=96=87=E4=BB=B6=E4=BD=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 25 +++++++++++++++++++++++++ cookies/.gitignore | 1 + crawler_bing_main.py | 2 +- data/.gitignore | 1 + tool/aiqicha_detail.py | 12 ++++++------ tool/aiqicha_login.py | 33 ++++++++++++++++++++++++--------- tool/bing_search.py | 2 +- 7 files changed, 59 insertions(+), 17 deletions(-) create mode 100644 .gitignore create mode 100644 cookies/.gitignore create mode 100644 data/.gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2298ef0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ + +.gitignore +.git/ +.idea/ +.vscode/ +.DS_Store +.env +.env.local +.env.development.local +.env.test.local +.env.production.local +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +node_modules/ +dist/ +build/ +.next/ +.out/ +.serverless/ +cdk.out/ +.vscode-test/ +cookies/ +data/ \ No newline at end of file diff --git a/cookies/.gitignore b/cookies/.gitignore new file mode 100644 index 0000000..f59ec20 --- /dev/null +++ b/cookies/.gitignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/crawler_bing_main.py b/crawler_bing_main.py index c0fc358..5037a5c 100644 --- a/crawler_bing_main.py +++ b/crawler_bing_main.py @@ -346,7 +346,7 @@ if __name__ == '__main__': filter_list_with_not_match = filter_aiqicha_qcc(data_list, company_name, with_not_match=True) # 创建CSV工具实例 csv_tool = CSVTool( - csv_file_name='company_search_filter_is_none_data.csv', + csv_file_name='data/company_search_filter_is_none_data.csv', headers=['company_name','title', 'web_site_type','url', 'request_url', 'create_time'] ) diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..f59ec20 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/tool/aiqicha_detail.py b/tool/aiqicha_detail.py index 39cd6e8..6238c81 100644 --- a/tool/aiqicha_detail.py +++ b/tool/aiqicha_detail.py @@ -7,7 +7,7 @@ from tool.aiqicha_login import AiqichaLoginManager # 导入登录管理器 from tool.aiqicha_detail_parser import AiqichaDetailParser # 导入解析器 class AiqichaDetailCrawler: - def __init__(self, cookie_path="aiqicha_cookies.json"): + def __init__(self, cookie_path="cookies/aiqicha_cookies.json"): self.browser = WebBrowser(cookie_path) self.browser_started = False self.login_manager = None # 添加登录管理器实例 @@ -28,11 +28,11 @@ class AiqichaDetailCrawler: print("已加载Cookie") # 使用登录管理器检测登录状态 - # logined = self.login_manager.check_and_login() - # if logined: - # print("登录成功") - # else: - # print("登录失败") + logined = self.login_manager.check_and_login() + if logined: + print("登录成功") + else: + print("登录失败") self.browser_started = True except Exception as e: print(f"启动浏览器失败: {e}") diff --git a/tool/aiqicha_login.py b/tool/aiqicha_login.py index 6f82a68..8fa71c0 100644 --- a/tool/aiqicha_login.py +++ b/tool/aiqicha_login.py @@ -16,17 +16,32 @@ class AiqichaLoginManager: else: print("已加载Cookie,验证登录状态...") # 加载cookie后访问页面验证是否真正登录 - self.browser.page.goto("https://aiqicha.baidu.com") - # 等待页面加载完成 - self.browser.page.wait_for_load_state("networkidle") + # 判断当前的url地址 如果是 https://aiqicha.baidu.com/company_detail_* 地址 + if "company_detail" in self.browser.page.url: + # 等待页面加载完成 + self.browser.page.wait_for_load_state("networkidle") + + # 验证登录状态 + if not self.check_login_status(): + print("Cookie已过期或无效,重新登录...") + return self.login() + else: + print("Cookie有效,已登录") + return True - # 验证登录状态 - if not self.check_login_status(): - print("Cookie已过期或无效,重新登录...") - return self.login() else: - print("Cookie有效,已登录") - return True + print("未登录或已过期,开始登录流程...") + self.browser.page.goto("https://aiqicha.baidu.com") + # 等待页面加载完成 + self.browser.page.wait_for_load_state("networkidle") + + # 验证登录状态 + if not self.check_login_status(): + print("Cookie已过期或无效,重新登录...") + return self.login() + else: + print("Cookie有效,已登录") + return True def check_login_status(self): """检测登录状态,返回True表示已登录""" diff --git a/tool/bing_search.py b/tool/bing_search.py index 5f6b01f..8323948 100644 --- a/tool/bing_search.py +++ b/tool/bing_search.py @@ -6,7 +6,7 @@ from tool.web_browser import WebBrowser class BingSearcher: - def __init__(self, cookie_path="bing_cookies.json"): + def __init__(self, cookie_path="cookies/bing_cookies.json"): self.browser = WebBrowser(cookie_path) self.browser_started = False