mirror of
https://github.com/Wcowin/Mkdocs-Wcowin.git
synced 2025-07-20 08:56:35 +00:00
393 lines
15 KiB
Python
393 lines
15 KiB
Python
import re
|
||
import json
|
||
import hashlib
|
||
import requests
|
||
from pathlib import Path
|
||
from datetime import datetime
|
||
|
||
class AISummaryGenerator:
    """Generate a short summary admonition for selected MkDocs pages.

    For every eligible page the generator cleans the Markdown, asks the
    DeepSeek chat-completions API for a summary, falls back to a rule-based
    extract when the API is unavailable, caches the result on disk keyed by
    a hash of the cleaned content, and prepends the summary to the page as
    an admonition block.

    The API key is never stored in source. It is taken from the ``api_key``
    constructor argument or, when that is ``None``, from the environment
    variable named by ``API_KEY_ENV``. Without a key the API call is skipped
    and the rule-based fallback is used.
    """

    # Environment variable consulted for the DeepSeek API key.
    API_KEY_ENV = 'DEEPSEEK_API_KEY'

    # Cached summaries older than this many days are ignored.
    CACHE_TTL_DAYS = 7

    # Admonition headers produced by this module (see format_summary) or by
    # the reading-info hook; such blocks are stripped before summarising so
    # that a previously injected summary is never fed back to the model.
    _GENERATED_BLOCK_HEADERS = (
        '!!! info "📖 阅读信息"',
        '!!! info "🤖 AI智能摘要"',
        '!!! info "💾 AI智能摘要"',
        '!!! tip "📝 自动摘要"',
    )

    # Quote characters models like to wrap their answer in: ASCII quotes,
    # backtick, and the curly double/single quotes.
    _WRAPPING_QUOTES = '"\'`\u201c\u201d\u2018\u2019'

    def __init__(self, cache_dir="site/.ai_cache", api_key=None):
        """Set up the cache directory, API settings and folder filters.

        Args:
            cache_dir: Directory holding the JSON cache files. Created with
                any missing parents.
            api_key: DeepSeek API key. ``None`` means "read it from the
                environment"; an empty string disables the API.
        """
        import os  # function-scope import keeps this block self-contained

        self.cache_dir = Path(cache_dir)
        # parents=True: the parent directory may not exist on a clean checkout.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        if api_key is None:
            api_key = os.environ.get(self.API_KEY_ENV, '')
        api_key = api_key.strip()

        request_headers = {'Content-Type': 'application/json'}
        if api_key:
            request_headers['Authorization'] = f'Bearer {api_key}'

        # DeepSeek API configuration.
        self.api_config = {
            'url': 'https://api.deepseek.com/v1/chat/completions',
            'model': 'deepseek-chat',
            'headers': request_headers,
        }

        # Folders (path prefixes relative to docs_dir) in which summaries
        # are enabled. Add e.g. 'blog/' here to enable more folders.
        self.enabled_folders = [
            'develop/',
        ]

        # Substrings that exclude a page when found anywhere in its path.
        # NOTE: matching is by substring, so 'index.md' excludes every path
        # containing that text, not only the root index page.
        self.exclude_patterns = [
            'waline.md', 'link.md', '404.md', 'tag.md', 'tags.md',
            '/about/', '/search/', '/sitemap', 'index.md',
        ]

        # Exact source paths that are always excluded.
        self.exclude_files = [
            'blog/index.md',
            'blog/indexblog.md',
            'docs/index.md',
            'develop/index.md',
        ]

    def configure_folders(self, folders=None, exclude_patterns=None, exclude_files=None):
        """Override the folder filters; ``None`` leaves a setting unchanged.

        Args:
            folders: Folder prefixes in which summaries are enabled.
            exclude_patterns: Substrings that exclude a page path.
            exclude_files: Exact source paths to exclude.
        """
        if folders is not None:
            self.enabled_folders = folders
        if exclude_patterns is not None:
            self.exclude_patterns = exclude_patterns
        if exclude_files is not None:
            self.exclude_files = exclude_files

    def get_content_hash(self, content):
        """Return the MD5 hex digest of ``content``, used as the cache key."""
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def get_cached_summary(self, content_hash):
        """Return the cached record for ``content_hash`` if still fresh.

        Returns:
            The cached dict, or ``None`` when there is no cache file, the
            file is unreadable or malformed, or the entry has expired.
        """
        cache_file = self.cache_dir / f"{content_hash}.json"
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)
            cache_time = datetime.fromisoformat(cache_data.get('timestamp', '1970-01-01'))
            age_days = (datetime.now() - cache_time).days
        except FileNotFoundError:
            # A plain cache miss, nothing to report.
            return None
        except (OSError, ValueError, TypeError, AttributeError) as e:
            # Corrupt or unexpected cache content: treat as a miss, but say so
            # instead of swallowing the error silently.
            print(f"读取摘要缓存失败: {e}")
            return None

        if age_days < self.CACHE_TTL_DAYS:
            return cache_data
        return None

    def save_summary_cache(self, content_hash, summary_data):
        """Write ``summary_data`` plus a timestamp to the cache (best effort).

        The caller's dict is not modified. Failures are reported and ignored
        so that a cache problem never breaks the site build.
        """
        cache_file = self.cache_dir / f"{content_hash}.json"
        record = dict(summary_data)
        record['timestamp'] = datetime.now().isoformat()
        try:
            # The directory may have been removed since construction.
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(record, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"保存摘要缓存失败: {e}")

    def clean_content_for_ai(self, markdown):
        """Reduce Markdown to plain text suitable for summarisation.

        Args:
            markdown: Raw page Markdown.

        Returns:
            The text with front matter, generated admonitions, HTML tags,
            code, table rows and inline formatting removed or replaced by
            short placeholders.
        """
        content = markdown

        # Drop YAML front matter at the very start of the document.
        content = re.sub(r'^---.*?---\s*', '', content, flags=re.DOTALL)

        # Drop previously generated reading-info / summary admonitions, each
        # up to the next blank line, heading or end of text.
        for header in self._GENERATED_BLOCK_HEADERS:
            content = re.sub(
                re.escape(header) + r'.*?(?=\n\n|\n#|\Z)',
                '',
                content,
                flags=re.DOTALL,
            )

        # Drop HTML tags.
        content = re.sub(r'<[^>]+>', '', content)

        # Replace images by a placeholder carrying the alt text.
        content = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'[图片:\1]', content)

        # Replace links by their link text.
        content = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', content)

        # Replace fenced code blocks, then inline code, by placeholders.
        content = re.sub(r'```(\w+)?\n(.*?)\n```', r'[代码示例]', content, flags=re.DOTALL)
        content = re.sub(r'`[^`]+`', '[代码]', content)

        # Drop table rows (pipe-delimited lines) and separator-only lines.
        content = re.sub(r'\|[^\n]+\|', '', content)
        content = re.sub(r'^[-|:\s]+$', '', content, flags=re.MULTILINE)

        # Unwrap bold and italic markers, strip heading markers.
        content = re.sub(r'\*\*([^*]+)\*\*', r'\1', content)
        content = re.sub(r'\*([^*]+)\*', r'\1', content)
        content = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE)

        # Collapse blank-line runs and strip leading indentation.
        content = re.sub(r'\n\s*\n', '\n\n', content)
        content = re.sub(r'^[ \t]+', '', content, flags=re.MULTILINE)
        return content.strip()

    @classmethod
    def _strip_summary_wrapping(cls, summary):
        """Remove wrapping quotes and a leading '摘要:' / '总结:' label."""
        summary = summary.strip().strip(cls._WRAPPING_QUOTES)
        # Accept both the ASCII colon and the full-width colon (U+FF1A).
        summary = re.sub(r'^\s*摘要[:\uFF1A]\s*', '', summary)
        summary = re.sub(r'^\s*总结[:\uFF1A]\s*', '', summary)
        return summary.strip()

    def generate_ai_summary(self, content, page_title=""):
        """Ask DeepSeek for a summary of ``content``.

        Args:
            content: Cleaned article text; only the first 2500 characters
                are sent.
            page_title: Page title included in the prompt (may be ``None``).

        Returns:
            The summary string, or ``None`` when no API key is configured,
            the request fails, or the response cannot be parsed.
        """
        headers = self.api_config.get('headers') or {}
        if not headers.get('Authorization'):
            print(f"未配置DeepSeek API密钥(环境变量 {self.API_KEY_ENV}),跳过AI摘要")
            return None

        page_title = page_title or ""

        prompt = f"""请为以下技术文章生成一个高质量的摘要,要求:

1. **长度控制**:严格控制在80-120字以内
2. **内容要求**:
   - 准确概括文章的核心主题和关键要点
   - 突出技术特点、应用场景或解决的问题
   - 使用专业但易懂的语言
   - 避免重复文章标题的内容
3. **格式要求**:
   - 直接返回摘要内容,无需任何前缀或后缀
   - 使用简洁的陈述句
   - 可以适当使用技术术语

文章标题:{page_title}

文章内容:
{content[:2500]}

请生成摘要:"""

        try:
            payload = {
                "model": self.api_config['model'],
                "messages": [
                    {
                        "role": "system",
                        "content": "你是一个专业的技术文档摘要专家,擅长提取文章核心要点并生成简洁准确的摘要。"
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                "max_tokens": 150,
                "temperature": 0.3,  # low randomness for more faithful output
                "top_p": 0.9
            }

            response = requests.post(
                self.api_config['url'],
                headers=headers,
                json=payload,
                timeout=30
            )

            if response.status_code != 200:
                print(f"DeepSeek API请求失败: {response.status_code} - {response.text}")
                return None

            result = response.json()
            raw_summary = result['choices'][0]['message']['content']
            return self._strip_summary_wrapping(raw_summary)

        except requests.exceptions.RequestException as e:
            print(f"DeepSeek API请求异常: {e}")
            return None
        except Exception as e:
            # Unexpected response shape etc.; the caller falls back.
            print(f"AI摘要生成异常: {e}")
            return None

    def generate_fallback_summary(self, content, page_title=""):
        """Build a rule-based summary when the API is unavailable.

        Sentences containing typical "topic" keywords are preferred; the
        result is kept to roughly 100 characters. When no usable sentence is
        found a generic sentence chosen from the page title is returned.

        Args:
            content: Cleaned article text.
            page_title: Page title (may be ``None``).

        Returns:
            A non-empty summary string.
        """
        page_title = page_title or ""

        # Strip residual formatting markers.
        clean_text = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE)
        clean_text = re.sub(r'\*\*([^*]+)\*\*', r'\1', clean_text)
        clean_text = re.sub(r'\*([^*]+)\*', r'\1', clean_text)

        # Split on CJK and ASCII sentence terminators (ideographic full stop,
        # full-width ! and ?, ASCII . ! ?). Whitespace inside a sentence is
        # collapsed so the summary never contains line breaks.
        pieces = re.split(r'[\u3002\uFF01\uFF1F.!?]', clean_text)
        sentences = [' '.join(piece.split()) for piece in pieces]
        sentences = [s for s in sentences if len(s) > 15]

        key_indicators = [
            '介绍', '讲解', '说明', '分析', '探讨', '研究', '实现', '应用',
            '方法', '技术', '算法', '原理', '概念', '特点', '优势', '解决',
            '教程', '指南', '配置', '安装', '部署', '开发', '设计', '构建'
        ]

        # Only the first 10 sentences are considered.
        priority_sentences = []
        normal_sentences = []
        for sentence in sentences[:10]:
            if any(keyword in sentence for keyword in key_indicators):
                priority_sentences.append(sentence)
            else:
                normal_sentences.append(sentence)

        selected_sentences = []
        total_length = 0

        # Keyword sentences first, within a 100-character budget.
        for sentence in priority_sentences:
            if total_length + len(sentence) > 100:
                break
            selected_sentences.append(sentence)
            total_length += len(sentence)

        # Top up with ordinary sentences while clearly under budget.
        if total_length < 80:
            for sentence in normal_sentences:
                if total_length + len(sentence) > 100:
                    break
                selected_sentences.append(sentence)
                total_length += len(sentence)

        if selected_sentences:
            summary = '。'.join(selected_sentences) + '。'
            # Safety net: never return an over-long summary.
            if len(summary) > 120:
                summary = selected_sentences[0] + '。'
            return summary

        # No usable sentence: choose a generic summary from the title.
        if any(keyword in page_title for keyword in ['教程', '指南', 'Tutorial']):
            return '本文提供了详细的教程指南,通过实例演示帮助读者掌握相关技术要点。'
        if any(keyword in page_title for keyword in ['配置', '设置', '安装', 'Config']):
            return '本文介绍了系统配置的方法和步骤,提供实用的设置建议和最佳实践。'
        if any(keyword in page_title for keyword in ['开发', '编程', 'Development']):
            return '本文分享了开发经验和技术实践,提供了实用的代码示例和解决方案。'
        return '本文深入探讨了相关技术内容,提供了实用的方法和解决方案。'

    def process_page(self, markdown, page, config):
        """Return ``markdown`` with a summary admonition prepended.

        Pages that are filtered out, or whose cleaned text is shorter than
        200 characters, are returned unchanged.

        Args:
            markdown: Raw page Markdown.
            page: Page object; ``page.file.src_path`` is read, and
                ``page.meta`` / ``page.title`` when present.
            config: Site configuration (unused).
        """
        if not self.should_generate_summary(page, markdown):
            return markdown

        clean_content = self.clean_content_for_ai(markdown)

        if len(clean_content) < 200:
            print(f"📄 内容太短,跳过摘要生成: {page.file.src_path}")
            return markdown

        content_hash = self.get_content_hash(clean_content)
        # The title can be None at this stage; normalise to a string.
        page_title = getattr(page, 'title', '') or ''

        cached_summary = self.get_cached_summary(content_hash)
        summary = cached_summary.get('summary', '') if cached_summary else ''

        if summary:
            ai_service = 'cached'
            print(f"✅ 使用缓存摘要: {page.file.src_path}")
        else:
            # Cache miss (or an empty cached summary): generate a new one.
            print(f"🤖 正在生成AI摘要: {page.file.src_path}")
            summary = self.generate_ai_summary(clean_content, page_title)

            if not summary:
                summary = self.generate_fallback_summary(clean_content, page_title)
                ai_service = 'fallback'
                print(f"📝 使用备用摘要: {page.file.src_path}")
            else:
                ai_service = 'deepseek'
                print(f"✅ AI摘要生成成功: {page.file.src_path}")

            self.save_summary_cache(content_hash, {
                'summary': summary,
                'service': ai_service,
                'page_title': page_title
            })

        # Put the summary at the very top of the page.
        summary_html = self.format_summary(summary, ai_service)
        return summary_html + '\n\n' + markdown

    def should_generate_summary(self, page, markdown):
        """Decide whether ``page`` gets a summary.

        Precedence: the ``ai_summary`` front-matter flag (false disables,
        true forces), then exclude patterns, then excluded files, then the
        enabled-folder list. Anything else is not summarised.
        """
        meta = getattr(page, 'meta', None) or {}
        flag = meta.get('ai_summary')
        # Equality-based membership so YAML 0/1 behave like false/true.
        if flag in (True, False):
            return bool(flag)

        # Normalise Windows path separators.
        src_path = page.file.src_path.replace('\\', '/')

        if any(pattern in src_path for pattern in self.exclude_patterns):
            return False

        if src_path in self.exclude_files:
            return False

        for folder in self.enabled_folders:
            if src_path.startswith(folder) or f'/{folder}' in src_path:
                folder_name = folder.rstrip('/')
                print(f"🎯 {folder_name}文件夹文章检测到,启用AI摘要: {src_path}")
                return True

        return False

    def format_summary(self, summary, ai_service):
        """Render ``summary`` as an admonition block.

        Args:
            summary: Summary text; may span several lines.
            ai_service: ``'deepseek'``, ``'fallback'`` or ``'cached'``;
                unknown values use the ``'deepseek'`` style.

        Returns:
            The admonition Markdown, terminated by a blank line.
        """
        service_config = {
            'deepseek': {
                'icon': '🤖',
                'name': 'AI智能摘要',
                'color': 'info'
            },
            'fallback': {
                'icon': '📝',
                'name': '自动摘要',
                'color': 'tip'
            },
            'cached': {
                'icon': '💾',
                'name': 'AI智能摘要',
                'color': 'info'
            }
        }

        style = service_config.get(ai_service, service_config['deepseek'])

        # Every line of the body must be indented by four spaces, otherwise
        # the second and later lines fall outside the admonition.
        body = '\n'.join(
            f'    {line}' if line.strip() else ''
            for line in str(summary).splitlines()
        )
        return f'!!! {style["color"]} "{style["icon"]} {style["name"]}"\n{body}\n\n'
|
||
|
||
# Create the global instance
|
||
# Instantiated once at import time; the constructor creates the cache
# directory as a side effect.
ai_summary_generator = AISummaryGenerator()
|
||
|
||
# 🔧 Custom configuration helper
|
||
def configure_ai_summary(enabled_folders=None, exclude_patterns=None, exclude_files=None):
    """Configure which pages receive an AI summary.

    Forwards the three settings to ``configure_folders`` on the module-level
    generator; an argument left as ``None`` is passed through as ``None``.

    Args:
        enabled_folders: Folders in which summaries are enabled,
            e.g. ``['blog/', 'docs/', 'posts/']``.
        exclude_patterns: Patterns to exclude, e.g. ``['404.md', '/admin/']``.
        exclude_files: Specific files to exclude, e.g. ``['blog/index.md']``.

    Example:
        # Enable only in the blog and docs folders
        configure_ai_summary(['blog/', 'docs/'])

        # Enable everywhere, but exclude specific files
        configure_ai_summary([''], exclude_files=['index.md', 'about.md'])
    """
    ai_summary_generator.configure_folders(
        folders=enabled_folders,
        exclude_patterns=exclude_patterns,
        exclude_files=exclude_files,
    )
|
||
|
||
def on_page_markdown(markdown, page, config, files):
    """MkDocs hook entry point: run the page through the summary generator.

    ``files`` is accepted to match the hook signature but is not used.
    Returns whatever ``process_page`` returns for this page.
    """
    processed = ai_summary_generator.process_page(markdown, page, config)
    return processed