# file: tool/bing_search.py
"""Bing search scraping helpers built on the project's ``WebBrowser`` wrapper.

Usage examples::

    # Option 1: manage the browser lifecycle manually
    searcher = BingSearcher()
    searcher.start_browser()
    results1 = searcher.search("阿里巴巴", 1)
    results2 = searcher.search("腾讯", 1)
    searcher.close_browser()

    # Option 2: use the context manager
    with BingSearcher() as searcher:
        results1 = searcher.search("阿里巴巴", 1)
        results2 = searcher.search("腾讯", 1)
"""
import time
import urllib.parse

from tool.web_browser import WebBrowser

# Number of results requested per Bing result page (the ``count`` query
# parameter); also the stride used to compute the ``first`` offset.
_RESULTS_PER_PAGE = 10

# Fixed pause after each page request, to avoid hitting Bing too quickly.
_PAGE_DELAY_SECONDS = 2


class BingSearcher:
    """Run Bing searches through a ``WebBrowser`` and collect result links.

    The browser is started lazily by :meth:`search` (or explicitly through
    :meth:`start_browser` / the context-manager protocol) and must be released
    with :meth:`close_browser`.
    """

    def __init__(self, cookie_path="bing_cookies.json"):
        """Create the browser wrapper; the browser itself is not started yet.

        Args:
            cookie_path (str): Path handed to ``WebBrowser``; used by its
                cookie load/save calls.
        """
        self.browser = WebBrowser(cookie_path)
        self.browser_started = False

    def start_browser(self):
        """Start the browser and load cookies; a no-op when already started.

        Failures are reported on stdout instead of being raised; callers
        check ``self.browser_started`` to find out whether startup worked.
        """
        if self.browser_started:
            return

        try:
            self.browser.start_browser()
        except Exception as e:
            print(f"启动浏览器失败: {e}")
            self.browser_started = False
            return

        try:
            if not self.browser.load_cookies():
                print("未找到有效Cookie")
            else:
                print("已加载Cookie")
        except Exception as e:
            print(f"启动浏览器失败: {e}")
            # The browser is already running at this point; close it so a
            # failed startup does not leave it behind. Cookies are NOT saved
            # here because none were loaded successfully.
            try:
                self.browser.close_browser()
            except Exception as close_error:
                print(f"关闭浏览器时出错: {close_error}")
            self.browser_started = False
            return

        self.browser_started = True

    def close_browser(self):
        """Save cookies and close the browser; a no-op when not started.

        Errors are reported on stdout instead of being raised, and the
        started flag is always cleared.
        """
        if not self.browser_started:
            return

        try:
            # Saving cookies is best-effort: a failure here must not prevent
            # the browser from being closed.
            try:
                self.browser.save_cookies()
            except Exception as e:
                print(f"关闭浏览器时出错: {e}")
            self.browser.close_browser()
        except Exception as e:
            print(f"关闭浏览器时出错: {e}")
        finally:
            self.browser_started = False

    def search(self, keyword, num_pages=1, start_page=1):
        """Search Bing for ``keyword`` and return the collected result links.

        Args:
            keyword (str): Search keyword.
            num_pages (int): Number of result pages to fetch. Defaults to 1.
            start_page (int): 1-based index of the first result page to
                fetch. Defaults to 1; values below 1 are treated as 1.

        Returns:
            list: The items returned by ``WebBrowser.extract_links`` for every
            visited page, in page order. Each item is indexed by ``'title'``
            and ``'url'`` here (the original documentation also lists
            ``request_url`` -- confirm against ``WebBrowser``). Returns an
            empty list when the browser could not be started, and the
            partial results gathered so far when an error interrupts the run.
        """
        if not self.browser_started:
            self.start_browser()

        if not self.browser_started:
            print("浏览器未启动,无法执行搜索")
            return []

        print(f'必应爬取任务进行中,爬取页数为{num_pages}...')

        # Zero-based index of the first page to request.
        first_page_index = max(start_page, 1) - 1
        all_results = []

        try:
            for page in range(first_page_index, first_page_index + num_pages):
                # Bing's ``first`` parameter is the 1-based rank of the first
                # result on the requested page.
                first = page * _RESULTS_PER_PAGE + 1
                url = (
                    f"https://cn.bing.com/search?q={urllib.parse.quote(keyword)}"
                    f"&first={first}&count={_RESULTS_PER_PAGE}&FORM=PERE"
                )
                print("正在爬取的url为:" + url)
                print('标题 url')

                if self.browser.visit_page(url):
                    # Result titles are the anchors inside <h2> elements.
                    results = self.browser.extract_links("h2 a")
                    all_results.extend(results)

                    for result in results:
                        print(result['title'], " ", result['url'])

                # Fixed delay so consecutive requests are not sent too fast.
                time.sleep(_PAGE_DELAY_SECONDS)
        except Exception as e:
            # Best-effort scraping: report the error and return what we have.
            print(f"搜索过程中出现错误: {e}")

        count = len(all_results)
        print(f"必应搜索爬取结果为{count}")
        return all_results

    def __enter__(self):
        """Context-manager entry: start the browser and return the searcher."""
        self.start_browser()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context-manager exit: close the browser; exceptions propagate."""
        self.close_browser()


def _parse_page_spec(num):
    """Turn a page specification into ``(start_page, num_pages)``.

    Args:
        num (str | int): Either a page count (``'3'`` or ``3``) or an
            inclusive 1-based page range (``'2:5'``).

    Returns:
        tuple: ``(start_page, num_pages)``. A plain count starts at page 1.

    Raises:
        ValueError: If the spec has more than one ``':'``, a non-numeric range
            bound, a range whose end precedes its start, or a count that
            ``int()`` cannot parse.
    """
    spec = str(num)
    if ':' not in spec:
        return 1, int(spec)

    if spec.count(':') > 1:
        raise ValueError("输入中必须且只能包含一个 ':'")

    start_text, end_text = spec.split(':')
    if not (start_text.isdigit() and end_text.isdigit()):
        raise ValueError("':' 两侧的值必须是数字")

    start_page, end_page = int(start_text), int(end_text)
    if end_page < start_page:
        raise ValueError("结束页不能小于起始页")

    return start_page, end_page - start_page + 1


# Function kept for compatibility with the legacy interface.
def bing_main(keyword, num='1'):
    """Legacy entry point: run one search and return titles and URLs.

    Args:
        keyword (str): Search keyword.
        num (str): Number of pages to fetch, or an inclusive page range such
            as ``'1:5'``. A range now starts at its first page instead of
            always starting at page 1. An ``int`` page count is accepted too.

    Returns:
        tuple: ``(titles, urls)`` -- two parallel lists.

    Raises:
        ValueError: If ``num`` is not a valid page specification.
    """
    # Validate the input before any browser is created.
    start_page, num_pages = _parse_page_spec(num)

    # The context manager guarantees the browser is closed afterwards.
    with BingSearcher() as searcher:
        results = searcher.search(keyword, num_pages, start_page)

        # Split the result records into parallel title / URL lists.
        titles = [result['title'] for result in results]
        urls = [result['url'] for result in results]
        return (titles, urls)