Files
SearchCompany/tool/bing_search.py
manchuwork 102dd78c26 aiqicha
2025-09-25 03:19:34 +08:00

153 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# file: tool/bing_search.py
import time
import urllib.parse
from tool.web_browser import WebBrowser
class BingSearcher:
    """Run Bing searches through a reusable ``WebBrowser`` session.

    The browser is started lazily by ``search`` (or explicitly through
    ``start_browser`` / the ``with`` statement) and is reused for every
    subsequent search until ``close_browser`` is called.
    """

    def __init__(self, cookie_path="bing_cookies.json"):
        """Create the searcher without launching the browser yet.

        Args:
            cookie_path (str): Forwarded unchanged to ``WebBrowser``
                (presumably the cookie file location -- confirm against
                ``tool.web_browser``).
        """
        self.browser = WebBrowser(cookie_path)
        # Set to True only after start_browser() finished without raising.
        self.browser_started = False

    def start_browser(self):
        """Start the browser and attempt to load cookies.

        A no-op when the browser is already marked as started. Errors are
        printed instead of raised; callers should inspect
        ``browser_started`` to learn whether start-up succeeded.
        """
        if self.browser_started:
            return
        try:
            self.browser.start_browser()
            # Missing cookies are reported but do not prevent start-up.
            if not self.browser.load_cookies():
                print("未找到有效Cookie")
            else:
                print("已加载Cookie")
            self.browser_started = True
        except Exception as e:
            print(f"启动浏览器失败: {e}")
            self.browser_started = False

    def close_browser(self):
        """Save cookies and close the browser if it was started.

        Saving cookies and closing the browser are attempted
        independently, so a failure while saving cookies can no longer
        skip the actual browser shutdown. Errors are printed, not raised,
        and ``browser_started`` is always reset to False.
        """
        if not self.browser_started:
            return
        try:
            self.browser.save_cookies()
        except Exception as e:
            print(f"关闭浏览器时出错: {e}")
        try:
            self.browser.close_browser()
        except Exception as e:
            print(f"关闭浏览器时出错: {e}")
        finally:
            self.browser_started = False

    def search(self, keyword, num_pages=1, delay=2):
        """Search Bing for ``keyword`` and collect the result links.

        Args:
            keyword (str): Search keyword; it is URL-quoted before use.
            num_pages (int): Number of result pages to fetch, starting
                from the first page. Defaults to 1.
            delay (float): Seconds to wait after each page so requests
                are not sent too quickly. Defaults to 2; a value of 0
                (or less) disables the wait.

        Returns:
            list: Whatever ``browser.extract_links("h2 a")`` yields for
            each visited page, concatenated in page order. Each item is
            indexed with ``'title'`` and ``'url'`` here (the original
            documentation also mentions a ``request_url`` entry). An
            empty list is returned when the browser cannot be started.
            If an error occurs mid-way, the results gathered so far are
            returned and the error is printed.
        """
        if not self.browser_started:
            self.start_browser()
        if not self.browser_started:
            print("浏览器未启动,无法执行搜索")
            return []

        print(f'必应爬取任务进行中,爬取页数为{num_pages}...')
        all_results = []
        try:
            # Quote once; kept inside the try so a bad keyword is reported
            # through the same error path as any other search failure.
            query = urllib.parse.quote(keyword)
            for page in range(num_pages):
                # Bing's ``first`` parameter is the 1-based index of the
                # first result on the page (10 results per page).
                first = page * 10 + 1
                url = (
                    f"https://cn.bing.com/search?q={query}"
                    f"&first={first}&count=10&FORM=PERE"
                )
                print("正在爬取的url为:" + url)
                print('标题 url')

                if self.browser.visit_page(url):
                    results = self.browser.extract_links("h2 a")
                    all_results.extend(results)
                    for result in results:
                        print(result['title'], " ", result['url'])

                # Fixed pause after every page (successful or not) to
                # avoid issuing requests too quickly.
                if delay and delay > 0:
                    time.sleep(delay)
        except Exception as e:
            print(f"搜索过程中出现错误: {e}")

        count = len(all_results)
        print(f"必应搜索爬取结果为{count}")
        return all_results

    def __enter__(self):
        """Context-manager entry: start the browser and return ``self``."""
        self.start_browser()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context-manager exit: close the browser; exceptions propagate."""
        self.close_browser()
# Backward-compatible wrapper for the legacy interface
def _parse_page_count(num):
    """Convert the legacy ``num`` argument into a number of pages to fetch."""
    # Accept ints as well as strings; the original crashed with a
    # TypeError on ``':' in 3``.
    num = str(num)
    if ':' not in num:
        return int(num)
    if num.count(':') > 1:
        raise ValueError("输入中必须且只能包含一个 ':'")
    start_page, end_page = num.split(':')
    if not (start_page.isdigit() and end_page.isdigit()):
        raise ValueError("':' 两侧的值必须是数字")
    num_pages = int(end_page) - int(start_page) + 1
    if num_pages < 1:
        # A reversed range used to launch a browser and silently return
        # nothing; reject it like any other malformed input.
        raise ValueError("结束页不能小于起始页")
    return num_pages


def bing_main(keyword, num='1'):
    """Search Bing and return titles and URLs (legacy interface).

    Args:
        keyword (str): Search keyword.
        num (str | int): Number of pages to fetch, or a range written as
            ``'start:end'`` (for example ``'1:5'``).

    Returns:
        tuple: ``(titles, urls)`` -- two parallel lists built from the
        ``'title'`` and ``'url'`` entries of each search result.

    Raises:
        ValueError: If ``num`` is not a number, contains more than one
            ``':'``, has non-numeric range bounds, or describes a range
            whose end lies before its start.
    """
    # Validate before creating the searcher so bad input never touches
    # the browser.
    # NOTE(review): for 'start:end' only the page COUNT is used; the
    # search still begins at the first result page, so '3:5' fetches
    # pages 1-3. Honouring the start page needs support in
    # BingSearcher.search.
    num_pages = _parse_page_count(num)

    # The context manager starts the browser on entry and always closes
    # it on exit, matching the previous try/finally.
    with BingSearcher() as searcher:
        results = searcher.search(keyword, num_pages)

    titles = [result['title'] for result in results]
    urls = [result['url'] for result in results]
    return (titles, urls)
# Usage examples:
# Option 1: manage the browser lifecycle manually
# searcher = BingSearcher()
# searcher.start_browser()
# results1 = searcher.search("阿里巴巴", 1)
# results2 = searcher.search("腾讯", 1)
# searcher.close_browser()
# Option 2: use the context manager
# with BingSearcher() as searcher:
#     results1 = searcher.search("阿里巴巴", 1)
#     results2 = searcher.search("腾讯", 1)