Mkdocs-Wcowin/overrides/hooks/ai_summary.py

1068 lines
45 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from dotenv import load_dotenv
load_dotenv() # 自动加载 .env 文件
import re
import json
import hashlib
import requests
from pathlib import Path
from datetime import datetime
import os
import shutil
class AISummaryGenerator:
def __init__(self):
# 🗂️ 统一缓存路径策略 - 本地和CI环境都使用项目根目录
# 这样避免了CI构建时被清理也简化了路径管理
self.cache_dir = Path(".ai_cache")
self.cache_dir.mkdir(parents=True, exist_ok=True)
# 🚀 CI 环境配置 - 默认只在 CI 环境中启用
# AI摘要环境配置
self.ci_config = {
# CI部署环境开关 (不用管只在ci.yml中设置有效)
'enabled_in_ci': os.getenv('AI_SUMMARY_CI_ENABLED', 'true').lower() == 'true',
# 本地部署环境开关 (true=本地开发时启用AI摘要)
'enabled_in_local': os.getenv('AI_SUMMARY_LOCAL_ENABLED', 'false').lower() == 'true',
# CI部署仅缓存模式(不用管只在ci.yml中设置有效)
'ci_only_cache': os.getenv('AI_SUMMARY_CI_ONLY_CACHE', 'false').lower() == 'true',
# 本地部署缓存功能开关 (true=启用缓存避免重复生成, false=总是生成新摘要)
'cache_enabled': os.getenv('AI_SUMMARY_CACHE_ENABLED', 'true').lower() == 'true',
# CI部署备用摘要开关 (不用管只在ci.yml中设置有效)
'ci_fallback_enabled': os.getenv('AI_SUMMARY_CI_FALLBACK', 'true').lower() == 'true',
}
# 🔄 自动缓存迁移逻辑(一次性迁移旧缓存) - 移到ci_config初始化之后
self._auto_migrate_cache()
# 添加服务配置文件,用于跟踪当前使用的服务
self.service_config_file = self.cache_dir / "service_config.json"
# 🤖 多AI服务配置
self.ai_services = {
'glm': {
'url': 'https://open.bigmodel.cn/api/paas/v4/chat/completions',
'model': 'glm-4-flash', # 或 'glm-4-plus', 'glm-4-air'
'api_key': os.getenv('GLM_API_KEY', ),
'max_tokens': 300,
'temperature': 0.3
},
'openai': {
'url': 'https://api.chatanywhere.tech/v1/chat/completions',
'model': 'gpt-3.5-turbo', # 或 'gpt-4', 'gpt-4-turbo'
'api_key': os.getenv('OPENAI_API_KEY', ),
'max_tokens': 150,
'temperature': 0.3
},
# 'claude': {
# 'url': 'https://api.anthropic.com/v1/messages',
# 'model': 'claude-3-haiku-20240307',
# 'api_key': os.getenv('ANTHROPIC_API_KEY', 'your-claude-api-key'),
# 'max_tokens': 150,
# 'temperature': 0.3
# },
'gemini': {
'url': 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent',
'model': 'gemini-pro',
'api_key': os.getenv('GOOGLE_API_KEY', 'AIzaSyDwWgffCCyVFZVsRasX3B3arWFaCT1PzNI'),
'max_tokens': 150,
'temperature': 0.3
}
}
# 默认使用的AI服务
self.default_service = 'glm'
# 服务优先级(按顺序尝试)
self.service_fallback_order = ['glm', 'openai', 'claude', 'gemini']
# 📂 可自定义的文件夹配置
self.enabled_folders = [
'blog/', # blog文件夹
'develop/', # develop文件夹
# 'posts/', # posts文件夹
# 'trip/', # trip文件夹
# 'about/', # about文件夹
]
# 📋 排除的文件和文件夹
self.exclude_patterns = [
'waline.md', 'link.md', '404.md', 'tag.md', 'tags.md',
'/about/', '/search/', '/sitemap', '/admin/',
'index.md', # 根目录index.md
]
# 📋 排除的特定文件
self.exclude_files = [
'blog/index.md',
'blog/indexblog.md',
'docs/index.md',
'develop/index.md',
]
# 🌍 语言配置/Language Configuration
self.summary_language = 'zh' # 默认中文,可选 'zh'、'en'、'both'
# 在初始化时就进行环境检查
self._check_environment()
# 检查服务变更并处理缓存
self._check_service_change()
def _check_environment(self):
"""初始化时检查环境"""
is_ci = self.is_ci_environment()
if is_ci:
ci_name = self._get_ci_name()
if self.ci_config['enabled_in_ci']:
print(f"🚀 检测到 CI 环境 ({ci_name})AI 摘要功能已启用")
self._should_run = True
else:
print(f"🚫 检测到 CI 环境 ({ci_name})AI 摘要功能已禁用")
self._should_run = False
else:
# 本地环境检查
if self.ci_config['enabled_in_local']:
print("💻 本地环境检测到AI 摘要功能已启用")
self._should_run = True
else:
print("🚫 本地环境检测到AI 摘要功能已禁用(仅在 CI 环境中启用)")
self._should_run = False
def _check_service_change(self):
"""检查AI服务是否发生变更如有变更则自动清理缓存"""
# 如果禁用了缓存功能,跳过服务变更检查
if not self.ci_config['cache_enabled']:
return
current_config = {
'default_service': self.default_service,
'available_services': list(self.ai_services.keys()),
'summary_language': self.summary_language,
'check_time': datetime.now().isoformat()
}
if self.service_config_file.exists():
try:
with open(self.service_config_file, 'r', encoding='utf-8') as f:
previous_config = json.load(f)
# 检查默认服务或语言是否变更
if (previous_config.get('default_service') != current_config['default_service'] or
previous_config.get('summary_language') != current_config['summary_language']):
old_service = previous_config.get('default_service', 'unknown')
new_service = current_config['default_service']
old_lang = previous_config.get('summary_language', 'zh')
new_lang = current_config['summary_language']
if old_service != new_service:
print(f"🔄 检测到AI服务变更: {old_service}{new_service}")
if old_lang != new_lang:
print(f"🌍 检测到语言变更: {old_lang}{new_lang}")
print("🧹 自动清理AI摘要缓存...")
try:
# 删除整个缓存目录
if self.cache_dir.exists():
shutil.rmtree(self.cache_dir)
print(f"✅ 已删除缓存文件夹: {self.cache_dir}")
# 重新创建缓存目录
self.cache_dir.mkdir(exist_ok=True)
print("📁 已重新创建缓存目录")
except Exception as e:
print(f"❌ 清理缓存失败: {e}")
# 如果删除失败,尝试清理单个文件
try:
self._clear_cache_files()
except:
print("⚠️ 缓存清理失败,新摘要可能会混用旧配置的缓存")
except Exception as e:
print(f"读取服务配置失败: {e}")
# 保存当前配置
try:
with open(self.service_config_file, 'w', encoding='utf-8') as f:
json.dump(current_config, f, ensure_ascii=False, indent=2)
except Exception as e:
print(f"保存服务配置失败: {e}")
def _clear_cache_files(self):
"""清理缓存文件(备用方法)"""
cleared_count = 0
try:
for cache_file in self.cache_dir.glob("*.json"):
if cache_file.name != "service_config.json":
cache_file.unlink()
cleared_count += 1
print(f"✅ 已清理 {cleared_count} 个缓存文件")
except Exception as e:
print(f"❌ 单文件清理失败: {e}")
def configure_ai_service(self, service_name, config=None):
"""
配置AI服务
Args:
service_name: 服务名称 ('glm', 'openai', 'azure_openai', 'claude', 'gemini')
config: 服务配置字典
"""
old_service = self.default_service
if config:
self.ai_services[service_name] = config
self.default_service = service_name
# 如果服务发生变更,自动清理缓存
if old_service != service_name:
print(f"🔄 AI服务已切换: {old_service}{service_name}")
print("🧹 自动清理所有AI摘要缓存...")
try:
if self.cache_dir.exists():
shutil.rmtree(self.cache_dir)
print(f"✅ 已删除缓存文件夹: {self.cache_dir}")
# 重新创建缓存目录
self.cache_dir.mkdir(exist_ok=True)
print("📁 已重新创建缓存目录")
except Exception as e:
print(f"❌ 清理缓存失败: {e}")
# 如果删除失败,尝试清理单个文件
try:
self._clear_cache_files()
except:
print("⚠️ 缓存清理失败,新摘要可能会混用旧服务的缓存")
# 更新服务配置记录
self._check_service_change()
def configure_language(self, language='zh'):
"""
配置摘要语言
Args:
language: 语言设置 ('zh': 中文, 'en': 英文, 'both': 双语)
"""
old_language = self.summary_language
self.summary_language = language
if old_language != language:
print(f"🌍 摘要语言已切换: {old_language}{language}")
print("🧹 自动清理摘要缓存以应用新语言设置...")
try:
if self.cache_dir.exists():
shutil.rmtree(self.cache_dir)
print(f"✅ 已删除缓存文件夹: {self.cache_dir}")
# 重新创建缓存目录
self.cache_dir.mkdir(exist_ok=True)
print("📁 已重新创建缓存目录")
except Exception as e:
print(f"❌ 清理缓存失败: {e}")
# 更新服务配置记录
self._check_service_change()
def configure_folders(self, folders=None, exclude_patterns=None, exclude_files=None):
"""配置启用AI摘要的文件夹"""
if folders is not None:
self.enabled_folders = folders
if exclude_patterns is not None:
self.exclude_patterns = exclude_patterns
if exclude_files is not None:
self.exclude_files = exclude_files
def get_content_hash(self, content):
"""生成内容hash用于缓存包含语言设置"""
content_with_lang = f"{content}_{self.summary_language}"
return hashlib.md5(content_with_lang.encode('utf-8')).hexdigest()
def get_cached_summary(self, content_hash):
"""获取缓存的摘要"""
# 如果禁用了缓存功能直接返回None
if not self.ci_config['cache_enabled']:
return None
cache_file = self.cache_dir / f"{content_hash}.json"
if cache_file.exists():
try:
with open(cache_file, 'r', encoding='utf-8') as f:
cache_data = json.load(f)
# 检查缓存是否过期7天
cache_time = datetime.fromisoformat(cache_data.get('timestamp', '1970-01-01'))
if (datetime.now() - cache_time).days < 7:
return cache_data
except:
pass
return None
def save_summary_cache(self, content_hash, summary_data):
"""保存摘要到缓存"""
# 如果禁用了缓存功能,不保存缓存
if not self.ci_config['cache_enabled']:
return
cache_file = self.cache_dir / f"{content_hash}.json"
try:
summary_data['timestamp'] = datetime.now().isoformat()
summary_data['language'] = self.summary_language
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump(summary_data, f, ensure_ascii=False, indent=2)
except Exception as e:
print(f"保存摘要缓存失败: {e}")
def clean_content_for_ai(self, markdown):
"""清理内容提取主要文本用于AI处理"""
content = markdown
# 移除YAML front matter
content = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)
# 移除已存在的阅读信息块和AI摘要块
content = re.sub(r'!!! info "📖 阅读信息".*?(?=\n\n|\n#|\Z)', '', content, flags=re.DOTALL)
content = re.sub(r'!!! info "🤖 AI智能摘要".*?(?=\n\n|\n#|\Z)', '', content, flags=re.DOTALL)
content = re.sub(r'!!! tip "📝 自动摘要".*?(?=\n\n|\n#|\Z)', '', content, flags=re.DOTALL)
# 移除HTML标签
content = re.sub(r'<[^>]+>', '', content)
# 移除图片保留alt文本作为内容提示
content = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'[图片:\1]', content)
# 移除链接,保留文本
content = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', content)
# 移除代码块,但保留关键信息
content = re.sub(r'```(\w+)?\n(.*?)\n```', r'[代码示例]', content, flags=re.DOTALL)
# 移除行内代码
content = re.sub(r'`[^`]+`', '[代码]', content)
# 移除表格格式但保留内容
content = re.sub(r'\|[^\n]+\|', '', content)
content = re.sub(r'^[-|:\s]+$', '', content, flags=re.MULTILINE)
# 清理格式符号
content = re.sub(r'\*\*([^*]+)\*\*', r'\1', content) # 粗体
content = re.sub(r'\*([^*]+)\*', r'\1', content) # 斜体
content = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE) # 标题符号
# 移除多余的空行和空格
content = re.sub(r'\n\s*\n', '\n\n', content)
content = re.sub(r'^[ \t]+', '', content, flags=re.MULTILINE)
content = content.strip()
return content
def build_headers(self, service_config):
"""构建请求头"""
headers = {
'Content-Type': 'application/json'
}
# 根据服务类型添加认证头
if 'azure_openai' in service_config.get('url', ''):
headers['api-key'] = service_config['api_key']
elif 'anthropic.com' in service_config.get('url', ''):
headers['x-api-key'] = service_config['api_key']
headers['anthropic-version'] = '2023-06-01'
elif 'googleapis.com' in service_config.get('url', ''):
# Google API使用URL参数
pass
else:
# OpenAI和GLM使用Bearer token
headers['Authorization'] = f"Bearer {service_config['api_key']}"
# 添加额外的头部
if 'headers_extra' in service_config:
headers.update(service_config['headers_extra'])
return headers
def build_payload(self, service_name, service_config, content, page_title):
"""构建请求载荷"""
# 根据语言设置生成不同的prompt
if self.summary_language == 'en':
prompt = f"""Please generate a high-quality summary for the following technical article with these requirements:
1. **Length Control**: Strictly limit to 80-120 words
2. **Content Requirements**:
- Accurately summarize the core themes and key points of the article
- Highlight technical features, application scenarios, or problems solved
- Use professional but understandable language
- Avoid repeating the article title content
3. **Format Requirements**:
- Return summary content directly without any prefix or suffix
- Use concise declarative sentences
- Technical terms are appropriate
Article Title: {page_title}
Article Content:
{content[:2500]}
Please generate summary:"""
elif self.summary_language == 'both':
prompt = f"""Please generate a bilingual summary (Chinese and English) for the following technical article with these requirements:
1. **Length Control**:
- Chinese: 80-120 characters
- English: 80-120 words
2. **Content Requirements**:
- Accurately summarize the core themes and key points
- Highlight technical features, application scenarios, or problems solved
- Use professional but understandable language
- Avoid repeating the article title content
3. **Format Requirements**:
- First provide Chinese summary
- Then provide English summary
- Separate with a blank line
- No prefixes or additional formatting
Article Title: {page_title}
Article Content:
{content[:2500]}
Please generate bilingual summary:"""
else: # 默认中文
prompt = f"""请为以下技术文章生成一个高质量的摘要,要求:
1. **长度控制**严格控制在80-120字以内
2. **内容要求**
- 准确概括文章的核心主题和关键要点
- 突出技术特点、应用场景或解决的问题
- 使用专业但易懂的语言
- 避免重复文章标题的内容
3. **格式要求**
- 直接返回摘要内容,无需任何前缀或后缀
- 使用简洁的陈述句
- 可以适当使用技术术语
文章标题:{page_title}
文章内容:
{content[:2500]}
请生成摘要:"""
if service_name == 'claude':
# Claude API格式
return {
"model": service_config['model'],
"max_tokens": service_config['max_tokens'],
"temperature": service_config['temperature'],
"messages": [
{
"role": "user",
"content": prompt
}
]
}
elif service_name == 'gemini':
# Gemini API格式
return {
"contents": [
{
"parts": [
{
"text": prompt
}
]
}
],
"generationConfig": {
"temperature": service_config['temperature'],
"maxOutputTokens": service_config['max_tokens']
}
}
else:
# OpenAI格式 (OpenAI, GLM, Azure OpenAI)
system_content = {
'zh': "你是一个专业的技术文档摘要专家,擅长提取文章核心要点并生成简洁准确的中文摘要。",
'en': "You are a professional technical documentation summary expert, skilled at extracting core points from articles and generating concise and accurate English summaries.",
'both': "You are a professional technical documentation summary expert, skilled at extracting core points from articles and generating concise and accurate bilingual summaries in both Chinese and English."
}
return {
"model": service_config['model'],
"messages": [
{
"role": "system",
"content": system_content.get(self.summary_language, system_content['zh'])
},
{
"role": "user",
"content": prompt
}
],
"max_tokens": service_config['max_tokens'] * (2 if self.summary_language == 'both' else 1),
"temperature": service_config['temperature'],
"top_p": 0.9
}
def extract_response_content(self, service_name, response_data):
"""从响应中提取内容"""
try:
if service_name == 'claude':
return response_data['content'][0]['text']
elif service_name == 'gemini':
return response_data['candidates'][0]['content']['parts'][0]['text']
else:
# OpenAI格式
return response_data['choices'][0]['message']['content']
except (KeyError, IndexError) as e:
print(f"解析{service_name}响应失败: {e}")
return None
def generate_ai_summary_with_service(self, content, page_title, service_name):
"""使用指定服务生成摘要"""
if service_name not in self.ai_services:
print(f"不支持的AI服务: {service_name}")
return None
service_config = self.ai_services[service_name]
# 检查API密钥
if not service_config['api_key'] or service_config['api_key'].startswith('your-'):
print(f"{service_name} API密钥未配置")
return None
try:
headers = self.build_headers(service_config)
payload = self.build_payload(service_name, service_config, content, page_title)
# 对于Google API添加API密钥到URL
url = service_config['url']
if service_name == 'gemini':
url = f"{url}?key={service_config['api_key']}"
response = requests.post(
url,
headers=headers,
json=payload,
timeout=30
)
if response.status_code == 200:
result = response.json()
summary = self.extract_response_content(service_name, result)
if summary:
# 清理可能的格式问题
summary = re.sub(r'^["""''`]+|["""''`]+$', '', summary.strip())
summary = re.sub(r'^\s*摘要[:]\s*', '', summary)
summary = re.sub(r'^\s*总结[:]\s*', '', summary)
summary = re.sub(r'^\s*Summary[:]\s*', '', summary)
summary = re.sub(r'^\s*Abstract[:]\s*', '', summary)
return summary
else:
print(f"{service_name} API请求失败: {response.status_code} - {response.text}")
return None
except requests.exceptions.RequestException as e:
print(f"{service_name} API请求异常: {e}")
return None
except Exception as e:
print(f"{service_name} 摘要生成异常: {e}")
return None
def generate_ai_summary(self, content, page_title=""):
"""生成AI摘要支持CI环境策略"""
is_ci = self.is_ci_environment()
# 如果在 CI 环境中且配置为只使用缓存
if is_ci and self.ci_config['ci_only_cache']:
print(f"📦 CI 环 environment仅使用缓存模式")
return None, 'ci_cache_only'
# 按优先级尝试不同服务
services_to_try = [self.default_service] + [s for s in self.service_fallback_order if s != self.default_service]
for service_name in services_to_try:
if service_name in self.ai_services:
lang_desc = {'zh': '中文', 'en': '英文', 'both': '双语'}
env_desc = '(CI)' if is_ci else '(本地)'
print(f"🔄 尝试使用 {service_name} 生成{lang_desc.get(self.summary_language, '中文')}摘要 {env_desc}...")
summary = self.generate_ai_summary_with_service(content, page_title, service_name)
if summary:
return summary, service_name
print("⚠️ 所有AI服务均不可用")
return None, None
def generate_fallback_summary(self, content, page_title=""):
"""生成备用摘要考虑CI环境配置"""
is_ci = self.is_ci_environment()
# 如果在 CI 环境中且禁用了备用摘要
if is_ci and not self.ci_config['ci_fallback_enabled']:
print(f"🚫 CI 环境禁用备用摘要")
return None
# 移除格式符号
clean_text = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE)
clean_text = re.sub(r'\*\*([^*]+)\*\*', r'\1', clean_text)
clean_text = re.sub(r'\*([^*]+)\*', r'\1', clean_text)
# 分割成句子
sentences = re.split(r'[\u3002\uff01\uff1f.!?]', clean_text)
sentences = [s.strip() for s in sentences if len(s.strip()) > 15]
# 优先选择包含关键词的句子
key_indicators = [
'介绍', '讲解', '说明', '分析', '探讨', '研究', '实现', '应用',
'方法', '技术', '算法', '原理', '概念', '特点', '优势', '解决',
'教程', '指南', '配置', '安装', '部署', '开发', '设计', '构建'
]
priority_sentences = []
normal_sentences = []
for sentence in sentences[:10]: # 处理前10句
if any(keyword in sentence for keyword in key_indicators):
priority_sentences.append(sentence)
else:
normal_sentences.append(sentence)
# 组合摘要
selected_sentences = []
total_length = 0
# 优先使用关键句子
for sentence in priority_sentences:
if total_length + len(sentence) > 100:
break
selected_sentences.append(sentence)
total_length += len(sentence)
# 如果还有空间,添加普通句子
if total_length < 80:
for sentence in normal_sentences:
if total_length + len(sentence) > 100:
break
selected_sentences.append(sentence)
total_length += len(sentence)
if selected_sentences:
summary = '.'.join(selected_sentences) + '.'
# 简化冗长的摘要
if len(summary) > 120:
summary = selected_sentences[0] + '.'
# 根据语言设置生成不同的备用摘要
if self.summary_language == 'en':
return self._generate_english_fallback(page_title)
elif self.summary_language == 'both':
zh_summary = summary
en_summary = self._generate_english_fallback(page_title)
return f"{zh_summary}\n\n{en_summary}"
else:
return summary
else:
# 根据标题和语言生成通用摘要
if self.summary_language == 'en':
return self._generate_english_fallback(page_title)
elif self.summary_language == 'both':
zh_summary = self._generate_chinese_fallback(page_title)
en_summary = self._generate_english_fallback(page_title)
return f"{zh_summary}\n\n{en_summary}"
else:
return self._generate_chinese_fallback(page_title)
def _generate_chinese_fallback(self, page_title=""):
"""生成中文备用摘要"""
if page_title:
# 根据标题生成通用摘要
if any(keyword in page_title for keyword in ['教程', '指南', '配置', '安装']):
return f"本文介绍了{page_title}的相关内容,包括具体的操作步骤和注意事项,为读者提供实用的技术指导。"
elif any(keyword in page_title for keyword in ['分析', '研究', '探讨', '原理']):
return f"本文深入分析了{page_title}的核心概念和技术原理,为读者提供详细的理论解析和实践见解。"
elif any(keyword in page_title for keyword in ['开发', '构建', '实现', '设计']):
return f"本文详细讲解了{page_title}的开发过程和实现方法,分享了实际的开发经验和技术方案。"
else:
return f"本文围绕{page_title}展开讨论,介绍了相关的技术概念、应用场景和实践方法。"
else:
return "本文介绍了相关的技术概念和实践方法,为读者提供有价值的参考信息。"
def _generate_english_fallback(self, page_title=""):
"""生成英文备用摘要"""
if page_title:
# 根据标题生成通用摘要
if any(keyword in page_title.lower() for keyword in ['tutorial', 'guide', 'setup', 'install', 'config']):
return f"This article provides a comprehensive guide on {page_title}, including step-by-step instructions and important considerations for practical implementation."
elif any(keyword in page_title.lower() for keyword in ['analysis', 'research', 'study', 'principle']):
return f"This article presents an in-depth analysis of {page_title}, exploring core concepts and technical principles with detailed theoretical insights."
elif any(keyword in page_title.lower() for keyword in ['develop', 'build', 'implement', 'design']):
return f"This article explains the development process and implementation methods for {page_title}, sharing practical development experience and technical solutions."
else:
return f"This article discusses {page_title}, covering relevant technical concepts, application scenarios, and practical methods."
else:
return "This article introduces relevant technical concepts and practical methods, providing valuable reference information for readers."
def is_ci_environment(self):
"""检测是否在 CI 环境中运行"""
# 常见的 CI 环境变量
ci_indicators = [
'CI', 'CONTINUOUS_INTEGRATION', # 通用 CI 标识
'GITHUB_ACTIONS', # GitHub Actions
'GITLAB_CI', # GitLab CI
'JENKINS_URL', # Jenkins
'TRAVIS', # Travis CI
'CIRCLECI', # CircleCI
'AZURE_HTTP_USER_AGENT', # Azure DevOps
'TEAMCITY_VERSION', # TeamCity
'BUILDKITE', # Buildkite
'CODEBUILD_BUILD_ID', # AWS CodeBuild
'NETLIFY', # Netlify
'VERCEL', # Vercel
'CF_PAGES', # Cloudflare Pages
]
for indicator in ci_indicators:
if os.getenv(indicator):
return True
return False
def should_run_in_current_environment(self):
"""判断是否应该在当前环境中运行 AI 摘要"""
return self._should_run
def _get_ci_name(self):
"""获取 CI 环境名称"""
if os.getenv('GITHUB_ACTIONS'):
return 'GitHub Actions'
elif os.getenv('GITLAB_CI'):
return 'GitLab CI'
elif os.getenv('JENKINS_URL'):
return 'Jenkins'
elif os.getenv('TRAVIS'):
return 'Travis CI'
elif os.getenv('CIRCLECI'):
return 'CircleCI'
elif os.getenv('AZURE_HTTP_USER_AGENT'):
return 'Azure DevOps'
elif os.getenv('NETLIFY'):
return 'Netlify'
elif os.getenv('VERCEL'):
return 'Vercel'
elif os.getenv('CF_PAGES'):
return 'Cloudflare Pages'
elif os.getenv('CODEBUILD_BUILD_ID'):
return 'AWS CodeBuild'
else:
return 'Unknown CI'
def _auto_migrate_cache(self):
"""自动迁移缓存文件(仅在需要时执行一次)"""
# 如果禁用了缓存功能,跳过缓存迁移
if not self.ci_config.get('cache_enabled', True):
return
old_cache_dir = Path("site/.ai_cache")
new_cache_dir = Path(".ai_cache")
# 检查是否需要迁移
if old_cache_dir.exists() and not new_cache_dir.exists():
print("🔄 检测到旧缓存目录,开始自动迁移...")
try:
# 创建新目录
new_cache_dir.mkdir(exist_ok=True)
# 复制文件
cache_files = list(old_cache_dir.glob("*.json"))
copied_count = 0
for cache_file in cache_files:
target_file = new_cache_dir / cache_file.name
try:
shutil.copy2(cache_file, target_file)
copied_count += 1
except Exception as e:
print(f"⚠️ 复制缓存文件失败 {cache_file.name}: {e}")
if copied_count > 0:
print(f"✅ 自动迁移完成!共迁移 {copied_count} 个缓存文件")
print("💡 提示:请将 .ai_cache 目录提交到 Git 仓库")
else:
print(" 没有缓存文件需要迁移")
except Exception as e:
print(f"❌ 自动迁移失败: {e}")
elif new_cache_dir.exists():
# 新缓存目录已存在,检查是否有文件
cache_files = list(new_cache_dir.glob("*.json"))
if cache_files:
is_ci = self.is_ci_environment()
env_desc = '(CI)' if is_ci else '(本地)'
print(f"📦 发现根目录缓存 {env_desc},共 {len(cache_files)} 个缓存文件")
def process_page(self, markdown, page, config):
"""处理页面生成AI摘要支持CI环境检测"""
# 检查是否应该在当前环境运行
if not self.should_run_in_current_environment():
return markdown
if not self.should_generate_summary(page, markdown):
return markdown
clean_content = self.clean_content_for_ai(markdown)
# 内容长度检查
if len(clean_content) < 100:
print(f"📄 内容太短,跳过摘要生成: {page.file.src_path}")
return markdown
content_hash = self.get_content_hash(clean_content)
page_title = getattr(page, 'title', '')
is_ci = self.is_ci_environment()
# 检查缓存
cached_summary = self.get_cached_summary(content_hash)
if cached_summary:
summary = cached_summary.get('summary', '')
ai_service = cached_summary.get('service', 'cached')
env_desc = '(CI)' if is_ci else '(本地)'
print(f"✅ 使用缓存摘要 {env_desc}: {page.file.src_path}")
else:
# 如果在 CI 环境中且配置为只使用缓存,直接跳过摘要生成
if is_ci and self.ci_config['ci_only_cache']:
print(f"📦 CI 环境仅使用缓存模式,无缓存可用,跳过摘要生成: {page.file.src_path}")
return markdown
# 生成新摘要
lang_desc = {'zh': '中文', 'en': '英文', 'both': '双语'}
env_desc = '(CI)' if is_ci else '(本地)'
print(f"🤖 正在生成{lang_desc.get(self.summary_language, '中文')}AI摘要 {env_desc}: {page.file.src_path}")
summary, ai_service = self.generate_ai_summary(clean_content, page_title)
if not summary:
# 尝试生成备用摘要
summary = self.generate_fallback_summary(clean_content, page_title)
if summary:
ai_service = 'fallback'
print(f"📝 使用备用摘要 {env_desc}: {page.file.src_path}")
else:
print(f"❌ 无法生成摘要 {env_desc}: {page.file.src_path}")
return markdown
else:
print(f"✅ AI摘要生成成功 ({ai_service}) {env_desc}: {page.file.src_path}")
# 保存到缓存
if summary:
self.save_summary_cache(content_hash, {
'summary': summary,
'service': ai_service,
'page_title': page_title
})
# 添加摘要到页面最上面
if summary:
summary_html = self.format_summary(summary, ai_service)
return summary_html + '\n\n' + markdown
else:
return markdown
def should_generate_summary(self, page, markdown):
"""判断是否应该生成摘要"""
# 检查页面元数据
if hasattr(page, 'meta'):
# 明确禁用
if page.meta.get('ai_summary') == False:
return False
# 强制启用
if page.meta.get('ai_summary') == True:
return True
# 获取文件路径
src_path = page.file.src_path.replace('\\', '/') # 统一路径分隔符
# 检查排除模式
if any(pattern in src_path for pattern in self.exclude_patterns):
return False
# 检查排除的特定文件
if src_path in self.exclude_files:
return False
# 检查是否在启用的文件夹中
for folder in self.enabled_folders:
if src_path.startswith(folder) or f'/{folder}' in src_path:
folder_name = folder.rstrip('/')
lang_desc = {'zh': '中文', 'en': '英文', 'both': '双语'}
print(f"🎯 {folder_name}文件夹文章检测到,启用{lang_desc.get(self.summary_language, '中文')}AI摘要: {src_path}")
return True
# 默认不生成摘要
return False
def format_summary(self, summary, ai_service):
"""格式化摘要显示包含CI环境标识"""
# 根据语言设置显示不同的标题
service_names = {
'zh': {
'glm': 'AI智能摘要 (智谱清言)',
'openai': 'AI智能摘要 (ChatGPT)',
'azure_openai': 'AI智能摘要 (Azure OpenAI)',
'claude': 'AI智能摘要 (Claude)',
'gemini': 'AI智能摘要 (Gemini)',
'fallback': '自动摘要',
'cached': 'AI智能摘要',
'ci_cache_only': 'AI智能摘要 (缓存)'
},
'en': {
'glm': 'AI Summary (GLM)',
'openai': 'AI Summary (ChatGPT)',
'azure_openai': 'AI Summary (Azure OpenAI)',
'claude': 'AI Summary (Claude)',
'gemini': 'AI Summary (Gemini)',
'fallback': 'Auto Summary',
'cached': 'AI Summary',
'ci_cache_only': 'AI Summary (Cached)'
},
'both': {
'glm': 'AI智能摘要 / AI Summary (GLM)',
'openai': 'AI智能摘要 / AI Summary (ChatGPT)',
'azure_openai': 'AI智能摘要 / AI Summary (Azure OpenAI)',
'claude': 'AI智能摘要 / AI Summary (Claude)',
'gemini': 'AI智能摘要 / AI Summary (Gemini)',
'fallback': '自动摘要 / Auto Summary',
'cached': 'AI智能摘要 / AI Summary',
'ci_cache_only': 'AI智能摘要 / AI Summary (缓存)'
}
}
name_config = service_names.get(self.summary_language, service_names['zh'])
service_name = name_config.get(ai_service, name_config['fallback'])
# 图标和颜色配置
icon = '💾' if ai_service not in ['fallback', 'ci_cache_only'] else '📝'
color = 'info' if ai_service not in ['fallback', 'ci_cache_only'] else 'tip'
return f'''!!! {color} "{icon} {service_name}"
{summary}
'''
# 创建全局实例
ai_summary_generator = AISummaryGenerator()
# 🔧 配置函数
def configure_ai_summary(enabled_folders=None, exclude_patterns=None, exclude_files=None,
ai_service=None, service_config=None, language='zh',
ci_enabled=None, local_enabled=None, ci_only_cache=None, ci_fallback=None, cache_enabled=None):
"""
配置AI摘要功能支持CI和本地环境分别配置
Args:
enabled_folders: 启用AI摘要的文件夹列表
exclude_patterns: 排除的模式列表
exclude_files: 排除的特定文件列表
ai_service: 使用的AI服务 ('glm', 'openai', 'claude', 'gemini')
service_config: AI服务配置
language: 摘要语言 ('zh': 中文, 'en': 英文, 'both': 双语)
ci_enabled: 是否在 CI 环境中启用
local_enabled: 是否在本地环境中启用
ci_only_cache: CI 环境是否仅使用缓存
ci_fallback: CI 环境是否启用备用摘要
cache_enabled: 是否启用缓存功能
Example:
# 本地开发时禁用缓存,总是生成新摘要
configure_ai_summary(
enabled_folders=['blog/', 'docs/'],
language='zh',
local_enabled=True,
cache_enabled=False # 禁用缓存
)
# CI中启用缓存本地禁用缓存
configure_ai_summary(
enabled_folders=['blog/', 'docs/'],
language='zh',
ci_enabled=True,
local_enabled=True,
ci_only_cache=True, # CI仅使用缓存
cache_enabled=True # 启用缓存功能
)
"""
ai_summary_generator.configure_folders(enabled_folders, exclude_patterns, exclude_files)
ai_summary_generator.configure_language(language)
# 配置环境行为
if any(x is not None for x in [ci_enabled, local_enabled, ci_only_cache, ci_fallback, cache_enabled]):
configure_ci_behavior(ci_enabled, local_enabled, ci_only_cache, ci_fallback, cache_enabled)
if ai_service:
if service_config:
# 合并配置
current_config = ai_summary_generator.ai_services.get(ai_service, {})
current_config.update(service_config)
ai_summary_generator.configure_ai_service(ai_service, current_config)
else:
ai_summary_generator.configure_ai_service(ai_service)
# 🔧 新增 CI 配置函数
def configure_ci_behavior(enabled_in_ci=None, enabled_in_local=None, ci_only_cache=None, ci_fallback_enabled=None, cache_enabled=None):
"""
配置 CI 和本地环境行为
Args:
enabled_in_ci: 是否在 CI 环境中启用 AI 摘要
enabled_in_local: 是否在本地环境中启用 AI 摘要
ci_only_cache: CI 环境是否仅使用缓存
ci_fallback_enabled: CI 环境是否启用备用摘要
cache_enabled: 是否启用缓存功能默认True
Example:
# 完全禁用缓存
configure_ci_behavior(cache_enabled=False)
# 本地开发时禁用缓存,总是生成新摘要
configure_ci_behavior(enabled_in_local=True, cache_enabled=False)
# CI中使用缓存本地禁用缓存
configure_ci_behavior(enabled_in_ci=True, enabled_in_local=True, ci_only_cache=True, cache_enabled=True)
"""
if enabled_in_ci is not None:
ai_summary_generator.ci_config['enabled_in_ci'] = enabled_in_ci
print(f"✅ CI 环境 AI 摘要: {'启用' if enabled_in_ci else '禁用'}")
if enabled_in_local is not None:
ai_summary_generator.ci_config['enabled_in_local'] = enabled_in_local
print(f"✅ 本地环境 AI 摘要: {'启用' if enabled_in_local else '禁用'}")
if ci_only_cache is not None:
ai_summary_generator.ci_config['ci_only_cache'] = ci_only_cache
print(f"✅ CI 环境仅缓存模式: {'启用' if ci_only_cache else '禁用'}")
if ci_fallback_enabled is not None:
ai_summary_generator.ci_config['ci_fallback_enabled'] = ci_fallback_enabled
print(f"✅ CI 环境备用摘要: {'启用' if ci_fallback_enabled else '禁用'}")
if cache_enabled is not None:
ai_summary_generator.ci_config['cache_enabled'] = cache_enabled
print(f"✅ 缓存功能: {'启用' if cache_enabled else '禁用'}")
def on_page_markdown(markdown, page, config, files):
"""MkDocs hook入口点"""
return ai_summary_generator.process_page(markdown, page, config)