153 lines
4.5 KiB
Python
153 lines
4.5 KiB
Python
# file: tool/bing_search.py
|
||
|
||
import time
|
||
import urllib.parse
|
||
from tool.web_browser import WebBrowser
|
||
|
||
|
||
class BingSearcher:
    """Run keyword searches on cn.bing.com through a ``WebBrowser`` instance.

    The browser is launched lazily: ``search`` starts it on first use if
    ``start_browser`` was not called explicitly. Instances also work as
    context managers, which start the browser on entry and close it on exit.
    """

    # Fixed pause (seconds) after each result page so that consecutive
    # requests are not issued back to back.
    PAGE_DELAY_SECONDS = 2

    def __init__(self, cookie_path="cookies/bing_cookies.json"):
        """Create the browser wrapper without launching it.

        Args:
            cookie_path (str): Passed straight to ``WebBrowser``; presumably
                the file cookies are loaded from / saved to (confirm against
                ``tool.web_browser``).
        """
        self.browser = WebBrowser(cookie_path)
        self.browser_started = False

    def start_browser(self):
        """Launch the browser and try to load cookies; no-op if already started.

        Failures are reported on stdout instead of being raised; callers
        check ``self.browser_started`` to learn whether startup succeeded.
        """
        if self.browser_started:
            return

        launched = False
        try:
            self.browser.start_browser()
            launched = True
            # Missing cookies are not fatal: the search simply runs without them.
            if not self.browser.load_cookies():
                print("未找到有效Cookie")
            else:
                print("已加载Cookie")
            self.browser_started = True
        except Exception as e:
            print(f"启动浏览器失败: {e}")
            self.browser_started = False
            # The browser may have launched before a later step failed. Close
            # it so it is not leaked (close_browser() skips work while the
            # started flag is False, so it would never be cleaned up).
            if launched:
                try:
                    self.browser.close_browser()
                except Exception as close_error:
                    print(f"关闭浏览器时出错: {close_error}")

    def close_browser(self):
        """Save cookies and close the browser; no-op if it is not running.

        Errors are printed rather than raised, and ``browser_started`` is
        always reset to ``False`` afterwards.
        """
        if not self.browser_started:
            return

        try:
            # Saving cookies is best-effort: a failure here must not prevent
            # the browser itself from being closed.
            try:
                self.browser.save_cookies()
            except Exception as e:
                print(f"保存Cookie时出错: {e}")
            self.browser.close_browser()
        except Exception as e:
            print(f"关闭浏览器时出错: {e}")
        finally:
            self.browser_started = False

    def search(self, keyword, num_pages=1):
        """Search Bing for ``keyword`` and collect links from the result pages.

        Pages are always fetched starting from the first result page
        (``first=1``, then ``first=11``, ...).

        Args:
            keyword (str): Search keyword; URL-quoted before being sent.
            num_pages (int): Number of result pages to crawl. Defaults to 1.

        Returns:
            list: Concatenation of whatever ``self.browser.extract_links``
            returns for each page. Each item is read here via ``'title'``
            and ``'url'`` keys. Returns an empty list if the browser cannot
            be started. If an exception occurs mid-crawl it is printed and
            the results gathered so far are returned.
        """
        if not self.browser_started:
            self.start_browser()

        # start_browser() reports failure via the flag instead of raising.
        if not self.browser_started:
            print("浏览器未启动,无法执行搜索")
            return []

        print(f'必应爬取任务进行中,爬取页数为{num_pages}...')

        all_results = []
        quoted_keyword = urllib.parse.quote(keyword)

        try:
            for page in range(num_pages):
                # Bing's ``first`` parameter is the 1-based index of the
                # first result on the page; 10 results are requested per page.
                first = page * 10 + 1
                url = f"https://cn.bing.com/search?q={quoted_keyword}&first={first}&count=10&FORM=PERE"

                print("正在爬取的url为:" + url)
                print('标题 url')

                if self.browser.visit_page(url):
                    # Result titles are the anchors inside <h2> elements.
                    results = self.browser.extract_links("h2 a")
                    all_results.extend(results)

                    for result in results:
                        print(result['title'], " ", result['url'])

                # Fixed pause between requests to avoid hitting Bing too fast.
                time.sleep(self.PAGE_DELAY_SECONDS)

        except Exception as e:
            print(f"搜索过程中出现错误: {e}")

        count = len(all_results)
        print(f"必应搜索爬取结果为{count}")
        return all_results

    def __enter__(self):
        """Start the browser and return this searcher (context-manager entry)."""
        self.start_browser()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the browser on context exit; exceptions are not suppressed."""
        self.close_browser()
|
||
|
||
|
||
# 兼容旧接口的函数
|
||
def _parse_page_count(num):
    """Turn the ``num`` argument of ``bing_main`` into a number of pages.

    Accepts an int, a numeric string such as ``'3'``, or an inclusive range
    string such as ``'1:5'`` (which yields 5).

    Raises:
        ValueError: If the string is not numeric, contains more than one
            ``':'``, has a non-digit side, or the range end is below its start.
    """
    if isinstance(num, int):
        return num
    if ':' not in num:
        return int(num)
    if num.count(':') > 1:
        raise ValueError("输入中必须且只能包含一个 ':'")

    start_page, end_page = (part.strip() for part in num.split(':'))
    if not (start_page.isdigit() and end_page.isdigit()):
        raise ValueError("':' 两侧的值必须是数字")
    if int(end_page) < int(start_page):
        # A reversed range would give a non-positive page count and launch
        # the browser for nothing.
        raise ValueError("起始页不能大于结束页")
    return int(end_page) - int(start_page) + 1


def bing_main(keyword, num='1'):
    """Legacy-compatible entry point: search Bing and return titles and URLs.

    Note: for a range such as ``'3:5'`` only the size of the range (3 pages)
    is used. ``BingSearcher.search`` receives just a page count, so crawling
    still begins at the first result page.

    Args:
        keyword (str): Search keyword.
        num (str | int): Number of pages, or an inclusive range string such
            as ``'1:5'``.

    Returns:
        tuple: ``(titles, urls)`` - two parallel lists built from the
        ``'title'`` and ``'url'`` entries of each search result.

    Raises:
        ValueError: If ``num`` is malformed (see ``_parse_page_count``).
    """
    # Validate before constructing the searcher so malformed input never
    # builds a browser wrapper.
    num_pages = _parse_page_count(num)

    # The context manager starts the browser on entry and always closes it
    # on exit, even if the search raises.
    with BingSearcher() as searcher:
        results = searcher.search(keyword, num_pages)

    titles = [result['title'] for result in results]
    urls = [result['url'] for result in results]
    return (titles, urls)
|
||
|
||
# 使用示例:
|
||
# 方法1: 手动管理浏览器生命周期
|
||
# searcher = BingSearcher()
|
||
# searcher.start_browser()
|
||
# results1 = searcher.search("阿里巴巴", 1)
|
||
# results2 = searcher.search("腾讯", 1)
|
||
# searcher.close_browser()
|
||
|
||
# 方法2: 使用上下文管理器
|
||
# with BingSearcher() as searcher:
|
||
# results1 = searcher.search("阿里巴巴", 1)
|
||
# results2 = searcher.search("腾讯", 1)
|