跳转到内容
回响之地
返回

AxMath 自定义 OCR - 使用 GLM 永久免费模型实现 LaTeX 公式识别

前言

AxMath 是一款优秀的数学公式编辑器,但其内置的 OCR 功能需要付费。本文介绍如何使用智谱 AI 的 GLM-4V-Flash 永久免费模型,为 AxMath 打造自定义 OCR 服务,实现图片公式到 LaTeX 的高效转换。

为什么选择 GLM-4V-Flash?

实现代码

以下是完整的 Python 服务代码:

import os
import sys
import time
import base64
import logging
import traceback
import re
from openai import OpenAI

try:
    import pyperclip
except ImportError:
    pyperclip = None

# ================= Zhipu AI configuration =================
# Replace this placeholder with your own Zhipu AI API key.
# Keys are issued at: https://open.bigmodel.cn/
API_KEY = "你的智谱AI API_KEY"

# Recommended: keep this standard model id.
# (Author's note: the id is expected to keep pointing at the latest Flash
# release even after newer versions are published.)
MODEL_NAME = "glm-4v-flash"

BASE_URL = "https://open.bigmodel.cn/api/paas/v4"
# ==========================================================

# Working directory of the bridge and the files exchanged through it.
BASE_PATH = r"D:\OCR\AxMath_Bridge"
PATH_IMG = os.path.join(BASE_PATH, "src.png")      # image to recognise (input)
PATH_RES = os.path.join(BASE_PATH, "res.tex")      # recognised LaTeX (output)
LOG_FILE = os.path.join(BASE_PATH, "service.log")  # service log

# logging.basicConfig(filename=...) opens the log file immediately and raises
# FileNotFoundError when the directory is missing. Guarding on the directory
# keeps the module importable so that main() can perform its own BASE_PATH
# check and exit quietly instead of crashing at import time.
if os.path.isdir(BASE_PATH):
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        datefmt='%H:%M:%S',
        encoding='utf-8'
    )

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file contents, as a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def call_llm_ocr(image_path):
    """Ask the configured GLM vision model to transcribe an image into LaTeX.

    The image is sent as a base64 ``data:image/png`` URL together with a fixed
    instruction prompt, through the OpenAI-compatible client pointed at
    ``BASE_URL``.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        ``response.choices[0].message.content`` exactly as returned by the
        client (NOTE(review): this may be ``None`` for an empty reply; callers
        should tolerate that).

    Raises:
        OSError: If the image file cannot be read.
        Exception: With a short hint message when the underlying error text
            contains "1210" or "401"; the original error is attached as
            ``__cause__``. Any other error propagates unchanged.
    """
    # Read the image outside the try block so a file error is never
    # misclassified as an API error by the substring checks below.
    base64_image = encode_image(image_path)

    instruction_text = (
        "You are a professional LaTeX OCR assistant. "
        "Convert the image content into standard AMS-LaTeX format."
        "\nRules:"
        "\n1. Return ONLY the raw LaTeX code."
        "\n2. Use \\left| and \\right\\rangle for Dirac notation. DO NOT use \\Big|."
        "\n3. Keep Chinese text as is."
        "\n4. Accuracy is the highest priority."
        "\n\nTask: Extract content to LaTeX:"
    )

    try:
        client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": instruction_text},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}},
                    ],
                }
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        detail = str(e)
        # Translate the two most common failures into a short hint while
        # keeping the original exception chained for the log.
        if "1210" in detail:
            raise Exception("参数错误 (1210) - 模型名称或参数不支持") from e
        if "401" in detail:
            raise Exception("API Key 错误 (401)") from e
        # Bare raise re-raises the active exception with its traceback intact.
        raise

def clean_latex(text):
    r"""Clean raw model output and wrap it in ``$$ ... $$``.

    Processing order:

    1. Remove markdown code-fence markers (```` ```latex ````,
       ```` ```markdown ````, ```` ``` ````).
    2. Rewrite ``\Big``/``\big`` sized bars and angle brackets to their
       ``\left``/``\right`` forms.
    3. Drop math-mode delimiters ``\[ \] \( \)``, ``$$`` and ``$``, while
       keeping LaTeX line breaks (``\\``) and escaped dollars (``\$``).
    4. Wrap each run of CJK / full-width characters in ``\text{...}``.
    5. Wrap the result in ``$$``: as a single unit when it contains a
       ``\begin{``/``\end{`` environment, otherwise one ``$$...$$`` per
       non-empty line, joined by `` \\ `` plus a newline.

    Args:
        text: Raw model reply; any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The cleaned, ``$$``-wrapped LaTeX string, or ``""`` when nothing is left.
    """
    if not text:
        return ""
    text = str(text).strip()

    # Remove markdown code-fence markers (language-tagged ones first).
    for fence in ("```latex", "```markdown", "```"):
        text = text.replace(fence, "")

    # Normalise \Big / \big sized delimiters to the standard paired form.
    sized_delimiters = (
        (r"\Big|", r"\left|"),
        (r"\big|", r"\left|"),
        (r"\Big\rangle", r"\right\rangle"),
        (r"\big\rangle", r"\right\rangle"),
        (r"\Big\langle", r"\left\langle"),
        (r"\big\langle", r"\left\langle"),
    )
    for sized, standard in sized_delimiters:
        text = text.replace(sized, standard)

    # Strip math delimiters in one left-to-right pass. The first alternative
    # consumes a whole "\\" (line break) or "\$" (escaped dollar) and keeps
    # it, so the backslash of a line break can never be mistaken for the start
    # of "\(" "\)" "\[" "\]". Plain chained str.replace calls turned
    # "a \\(b)" into "a \b)".
    def _keep_escapes(match):
        return match.group(1) or ""

    text = re.sub(r"(\\\\|\\\$)|\\[\[\]()]|\$", _keep_escapes, text)

    # Wrap Chinese text (CJK ideographs, CJK punctuation, full-width forms).
    pattern = r'([\u4e00-\u9fa5\u3000-\u303f\uff00-\uffef]+)'
    text = re.sub(pattern, r'\\text{\1}', text)

    # An environment spans several lines and must stay one formula.
    if "\\begin{" in text or "\\end{" in text:
        return f"$${text.strip()}$$"

    lines = [line.strip() for line in text.split('\n') if line.strip()]
    if not lines:
        return ""

    wrapped_lines = [f"$${line}$$" for line in lines]
    return " \\\\ \n".join(wrapped_lines)

def _image_is_readable(path, attempts=20, delay=0.01):
    """Return True once ``path`` can be opened and read, retrying on OSError."""
    for _ in range(attempts):
        try:
            with open(path, "rb") as f:
                f.read()
            return True
        except OSError:
            time.sleep(delay)
    return False


def _write_result(tex):
    """Write ``tex`` to PATH_RES as gbk, dropping characters gbk cannot encode."""
    # errors='ignore' gives the same file as a strict write whenever every
    # character is encodable, so no separate strict attempt is needed.
    with open(PATH_RES, "w", encoding='gbk', errors='ignore') as f:
        f.write(tex)


def main():
    """Run the polling loop that turns PATH_IMG into LaTeX in PATH_RES.

    Returns immediately when BASE_PATH does not exist. Otherwise loops
    forever: whenever PATH_IMG appears and is readable, it is sent to
    ``call_llm_ocr``, the reply is cleaned with ``clean_latex``, written to
    PATH_RES (gbk) and, when pyperclip is available, copied to the clipboard.
    On any processing error an error marker is written to PATH_RES instead.
    The image is deleted after each processing attempt.
    """
    if not os.path.exists(BASE_PATH):
        return

    # sys.stdout / sys.stderr can be None (e.g. when started without a
    # console); give them a sink so stray writes do not fail.
    try:
        if sys.stdout is None:
            sys.stdout = open(os.devnull, "w")
        if sys.stderr is None:
            sys.stderr = open(os.devnull, "w")
    except OSError:
        pass

    logging.info(f"🚀 Service Started. Endpoint: {MODEL_NAME}")

    while True:
        if not os.path.exists(PATH_IMG):
            time.sleep(0.05)
            continue

        # Wait for the image to become readable BEFORE entering the
        # try/finally below: a `continue` inside that try would still run the
        # finally clause and delete an image that is not ready yet.
        if not _image_is_readable(PATH_IMG):
            time.sleep(0.05)
            continue

        try:
            logging.info("🖼️ Detecting...")
            start_t = time.time()

            result = call_llm_ocr(PATH_IMG)
            final_tex = clean_latex(result)

            _write_result(final_tex)

            # Clipboard copy is best-effort; a failure must not fail the job.
            if pyperclip:
                try:
                    pyperclip.copy(final_tex)
                except Exception as clip_err:
                    logging.warning("Clipboard copy failed: %s", clip_err)

            cost = time.time() - start_t
            logging.info(f"✅ Done ({cost:.2f}s). Len: {len(final_tex)}")

        except Exception as e:
            # logging.exception records the traceback along with the message.
            logging.exception(f"❌ Error: {e}")
            try:
                _write_result(r"$$\text{Error: Check Log}$$")
            except OSError as write_err:
                logging.warning("Could not write error marker: %s", write_err)
        finally:
            # Remove the processed image so it is not picked up again.
            try:
                if os.path.exists(PATH_IMG):
                    time.sleep(0.01)
                    os.remove(PATH_IMG)
            except OSError as rm_err:
                logging.warning("Could not remove image: %s", rm_err)

# Start the service loop only when this file is executed as a script.
if __name__ == "__main__":
    main()

使用方法

1. 获取 API Key

  1. 访问智谱 AI 开放平台(https://open.bigmodel.cn/)
  2. 注册账号并登录
  3. 在控制台获取 API Key

2. 配置环境

pip install openai pyperclip

3. 配置 AxMath

  1. 在 AxMath 中设置自定义 OCR 路径
  2. 将脚本中的 BASE_PATH 设置为 AxMath 的 OCR 目录
  3. 运行 Python 脚本启动服务

4. 使用流程

  1. 在 AxMath 中截图公式
  2. 脚本自动检测图片并调用 GLM-4V-Flash
  3. 识别结果自动写入 res.tex 并复制到剪贴板
  4. 在 AxMath 中粘贴即可

核心功能

智能清洗

错误处理

性能优化

总结

通过智谱 AI 的 GLM-4V-Flash 永久免费模型,我们可以为 AxMath 打造一个高效、准确的 OCR 服务。相比付费方案,这个方案具有以下优势:

特性 | 付费 OCR | GLM-4V-Flash
费用 | 需付费 | 永久免费
准确率 | 较高
中文支持 | 一般 | 优秀
自定义 | 有限 | 完全可控

推荐给需要频繁输入数学公式的同学和研究人员使用!



Previous Post
ReClip 项目介绍 - 开源视频音频下载器
Next Post
硅与锗导带底等能面 3D 示意图