mirror of
https://github.com/Wcowin/Mkdocs-Wcowin.git
synced 2025-07-20 08:56:35 +00:00
255 lines
10 KiB
Python
255 lines
10 KiB
Python
import re
|
||
from functools import lru_cache
|
||
|
||
# 预编译正则表达式(保持原有格式)
|
||
EXCLUDE_PATTERNS = [
|
||
re.compile(r'^index\.md$'),
|
||
re.compile(r'^about/'),
|
||
re.compile(r'^trip/index\.md$'),
|
||
re.compile(r'^relax/index\.md$'),
|
||
re.compile(r'^blog/indexblog\.md$'),
|
||
re.compile(r'^blog/posts\.md$'),
|
||
re.compile(r'^develop/index\.md$'),
|
||
re.compile(r'waline\.md$'),
|
||
re.compile(r'link\.md$'),
|
||
re.compile(r'404\.md$'),
|
||
]
|
||
|
||
# 优化的字符统计正则表达式
|
||
CHINESE_CHARS_PATTERN = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]')
|
||
CODE_BLOCK_PATTERN = re.compile(r'```.*?```', re.DOTALL)
|
||
INLINE_CODE_PATTERN = re.compile(r'`[^`]+`')
|
||
YAML_FRONT_PATTERN = re.compile(r'^---.*?---\s*', re.DOTALL)
|
||
HTML_TAG_PATTERN = re.compile(r'<[^>]+>')
|
||
IMAGE_PATTERN = re.compile(r'!\[.*?\]\([^)]+\)')
|
||
LINK_PATTERN = re.compile(r'\[([^\]]+)\]\([^)]+\)')
|
||
|
||
# 预定义排除类型
|
||
EXCLUDE_TYPES = frozenset({'landing', 'special', 'widget'})
|
||
|
||
# 扩展非编程行内代码词汇(更全面的过滤)
|
||
NON_CODE_WORDS = frozenset({
|
||
'markdown', 'target', 'blank', 'lg', 'middle', 'small', 'large',
|
||
'left', 'right', 'center', 'top', 'bottom', 'primary', 'secondary',
|
||
'success', 'warning', 'danger', 'info', 'light', 'dark', 'grid',
|
||
'cards', 'octicons', 'bookmark', 'div', 'class', 'img', 'src',
|
||
'alt', 'width', 'height', 'style', 'id', 'data', 'href', 'title'
|
||
})
|
||
|
||
# 支持的编程和标记语言(扩展版本)
|
||
PROGRAMMING_LANGUAGES = frozenset({
|
||
# 编程语言
|
||
'python', 'py', 'javascript', 'js', 'typescript', 'ts', 'java', 'cpp', 'c',
|
||
'go', 'rust', 'php', 'ruby', 'swift', 'kotlin', 'csharp', 'cs',
|
||
# 脚本语言
|
||
'bash', 'sh', 'powershell', 'ps1', 'zsh', 'fish', 'bat', 'cmd',
|
||
# 标记和配置语言
|
||
'html', 'css', 'scss', 'sass', 'less', 'yaml', 'yml', 'json', 'xml',
|
||
'toml', 'ini', 'conf', 'dockerfile', 'makefile',
|
||
# 数据库和查询
|
||
'sql', 'mysql', 'postgresql', 'sqlite', 'mongodb',
|
||
# 其他
|
||
'r', 'matlab', 'scala', 'perl', 'lua', 'dart', 'tex', 'latex',
|
||
# 数据格式
|
||
'csv', 'properties',
|
||
# 无标识符(空字符串也算作有效语言)
|
||
''
|
||
})
|
||
|
||
@lru_cache(maxsize=256)
|
||
def clean_markdown_content_for_chinese(content_hash, markdown):
|
||
"""清理Markdown内容,只保留中文文本用于统计(添加缓存)"""
|
||
content = markdown
|
||
|
||
# 使用预编译的正则表达式
|
||
content = YAML_FRONT_PATTERN.sub('', content)
|
||
content = HTML_TAG_PATTERN.sub('', content)
|
||
content = IMAGE_PATTERN.sub('', content)
|
||
content = LINK_PATTERN.sub(r'\1', content)
|
||
content = CODE_BLOCK_PATTERN.sub('', content)
|
||
content = INLINE_CODE_PATTERN.sub('', content)
|
||
|
||
return content
|
||
|
||
def count_code_lines(markdown):
|
||
"""统计代码行数(修复版本 - 正确处理所有代码行)"""
|
||
code_blocks = CODE_BLOCK_PATTERN.findall(markdown)
|
||
total_code_lines = 0
|
||
|
||
for i, block in enumerate(code_blocks):
|
||
# 提取语言标识
|
||
lang_match = re.match(r'^```(\w*)', block)
|
||
language = lang_match.group(1).lower() if lang_match else ''
|
||
|
||
# 移除开头的语言标识和结尾的```
|
||
code_content = re.sub(r'^```\w*\n?', '', block)
|
||
code_content = re.sub(r'\n?```$', '', code_content)
|
||
|
||
# 过滤空代码块
|
||
if not code_content.strip():
|
||
continue
|
||
|
||
# 计算有效行数(包含所有非空行,包括注释行)
|
||
lines = [line for line in code_content.split('\n') if line.strip()]
|
||
line_count = len(lines)
|
||
|
||
# 如果有明确的编程语言标识,直接统计
|
||
if language and language in PROGRAMMING_LANGUAGES:
|
||
total_code_lines += line_count
|
||
continue
|
||
|
||
# 增强的检测策略 - 更宽松的判断
|
||
is_code = False
|
||
|
||
# 1. 命令行检测
|
||
command_indicators = [
|
||
'sudo ', 'npm ', 'pip ', 'git ', 'cd ', 'ls ', 'mkdir ', 'rm ', 'cp ', 'mv ',
|
||
'chmod ', 'chown ', 'grep ', 'find ', 'ps ', 'kill ', 'top ', 'cat ', 'echo ',
|
||
'wget ', 'curl ', 'tar ', 'zip ', 'unzip ', 'ssh ', 'scp ', 'rsync ',
|
||
'xattr ', 'codesign ', 'xcode-select ', 'spctl ', 'launchctl ',
|
||
'brew ', 'defaults ', 'ditto ', 'hdiutil ', 'diskutil ',
|
||
'dir ', 'copy ', 'xcopy ', 'del ', 'rd ', 'md ', 'type ', 'attrib ',
|
||
'$ ', '# ', '% ', '> ', 'C:\\>', 'PS>',
|
||
'--', '-r', '-d', '-f', '-v', '-h', '--help', '--version',
|
||
'--force', '--deep', '--sign', '--master-disable',
|
||
'/Applications/', '/usr/', '/etc/', '/var/', '/home/', '~/',
|
||
'C:\\', 'D:\\', '.app', '.exe', '.pkg', '.dmg', '.zip', '.tar',
|
||
'#!/',
|
||
]
|
||
|
||
if any(indicator in code_content for indicator in command_indicators):
|
||
is_code = True
|
||
|
||
# 2. 编程语法检测(增强版)
|
||
if not is_code:
|
||
programming_indicators = [
|
||
# Python语法特征
|
||
'def ', 'class ', 'import ', 'from ', 'return ', 'yield ', 'lambda ',
|
||
'with ', 'as ', 'try:', 'except:', 'finally:', 'elif ', 'if __name__',
|
||
'print(', '.append(', '.extend(', '.remove(', '.sort(', '.reverse(',
|
||
'range(', 'len(', 'str(', 'int(', 'float(', 'list(', 'dict(',
|
||
# JavaScript/TypeScript语法
|
||
'function', 'var ', 'let ', 'const ', 'async ', 'await ', '=>',
|
||
'console.log', 'document.', 'window.', 'require(',
|
||
# 通用编程语法
|
||
'public ', 'private ', 'protected ', 'static ', 'void ', 'int ',
|
||
'string ', 'boolean ', 'float ', 'double ', 'char ',
|
||
# 操作符和结构
|
||
'==', '!=', '<=', '>=', '&&', '||', '++', '--', '+=', '-=', '**',
|
||
# 特殊结构
|
||
'while ', 'for ', 'if ', 'else:', 'switch ', 'case ',
|
||
# HTML/XML语法
|
||
'<!DOCTYPE', '<html', '<head', '<body', '<div', '<span', '<p>',
|
||
'<style', '<script', '<link', '<meta', '<title', '<img',
|
||
# CSS语法
|
||
'display:', 'color:', 'background:', 'margin:', 'padding:',
|
||
'font-size:', 'width:', 'height:', 'position:', 'border:',
|
||
# YAML语法
|
||
'name:', 'version:', 'theme:', 'title:', 'description:',
|
||
# JSON语法
|
||
'{"', '"}', '":', '",', '[{', '}]', 'null', 'true', 'false',
|
||
# 配置文件语法
|
||
'[', ']', '//', '/*', '*/', '<!--', '-->',
|
||
# SQL语法
|
||
'SELECT ', 'FROM ', 'WHERE ', 'INSERT ', 'UPDATE ', 'DELETE ',
|
||
'CREATE ', 'ALTER ', 'DROP ', 'INDEX ', 'TABLE ',
|
||
# 数学公式和LaTeX
|
||
'\\', '$', '$$', '\\begin', '\\end', '\\frac', '\\sum',
|
||
]
|
||
|
||
if any(indicator in code_content for indicator in programming_indicators):
|
||
is_code = True
|
||
|
||
# 3. 结构化检测
|
||
if not is_code:
|
||
# 缩进结构检测
|
||
if len(lines) > 1 and any(line.startswith(' ') or line.startswith('\t') for line in lines):
|
||
is_code = True
|
||
|
||
# HTML标签结构
|
||
elif '<' in code_content and '>' in code_content:
|
||
is_code = True
|
||
|
||
# 包含特殊字符组合
|
||
elif any(char in code_content for char in ['{', '}', '(', ')', '[', ']']) and ('=' in code_content or ':' in code_content):
|
||
is_code = True
|
||
|
||
# 4. 模式匹配检测(宽松策略)
|
||
if not is_code and len(lines) >= 1:
|
||
special_patterns = [
|
||
r'\w+\(\)', r'\w+\[\]', r'\w+\{\}', r'\w+=\w+', r'\w+:\w+',
|
||
r'<\w+>', r'\$\w+', r'#\w+', r'@\w+', r'\w+\.\w+\(\)',
|
||
r'\d+\.\d+\.\d+', r'http[s]?://', r'ftp://', r'localhost',
|
||
r'def\s+\w+', r'class\s+\w+', r'import\s+\w+', r'from\s+\w+',
|
||
r'if\s+\w+', r'while\s+\w+', r'for\s+\w+', r'return\s+\w*',
|
||
r'\w+\s*=\s*\w+', r'\w+\.\w+', r'#.*输出', r'#.*结果'
|
||
]
|
||
|
||
if any(re.search(pattern, code_content) for pattern in special_patterns):
|
||
is_code = True
|
||
|
||
# 如果判断为代码,则统计行数
|
||
if is_code:
|
||
total_code_lines += line_count
|
||
|
||
return total_code_lines
|
||
|
||
def calculate_reading_stats(markdown):
|
||
"""计算中文字符数和代码行数"""
|
||
# 生成内容哈希用于缓存
|
||
content_hash = hash(markdown)
|
||
|
||
# 使用缓存的清理函数
|
||
clean_content = clean_markdown_content_for_chinese(content_hash, markdown)
|
||
chinese_chars = len(CHINESE_CHARS_PATTERN.findall(clean_content))
|
||
|
||
# 统计代码行数
|
||
code_lines = count_code_lines(markdown)
|
||
|
||
# 计算阅读时间(中文:400字/分钟)
|
||
reading_time = max(1, round(chinese_chars / 400))
|
||
|
||
return reading_time, chinese_chars, code_lines
|
||
|
||
def on_page_markdown(markdown, **kwargs):
|
||
page = kwargs['page']
|
||
|
||
# 快速排除检查
|
||
if page.meta.get('hide_reading_time', False):
|
||
return markdown
|
||
|
||
# 保持原有的EXCLUDE_PATTERNS循环检查方式
|
||
src_path = page.file.src_path
|
||
for pattern in EXCLUDE_PATTERNS:
|
||
if pattern.match(src_path):
|
||
return markdown
|
||
|
||
# 优化类型检查
|
||
page_type = page.meta.get('type', '')
|
||
if page_type in EXCLUDE_TYPES:
|
||
return markdown
|
||
|
||
# 快速预检查
|
||
if len(markdown) < 300:
|
||
return markdown
|
||
|
||
# 计算统计信息
|
||
reading_time, chinese_chars, code_lines = calculate_reading_stats(markdown)
|
||
|
||
# 过滤太短的内容
|
||
if chinese_chars < 50:
|
||
return markdown
|
||
|
||
# 生成阅读信息
|
||
if code_lines > 0:
|
||
reading_info = f"""!!! info "📖 阅读信息"
|
||
阅读时间:**{reading_time}** 分钟 | 中文字符:**{chinese_chars}** | 有效代码行数:**{code_lines}**
|
||
|
||
"""
|
||
else:
|
||
reading_info = f"""!!! info "📖 阅读信息"
|
||
阅读时间:**{reading_time}** 分钟 | 中文字符:**{chinese_chars}**
|
||
|
||
"""
|
||
|
||
return reading_info + markdown |