m3e 向量化 MySQL 某表

张

张建站

2026/5/29 22:05:25

10分钟阅读

一、在线请求 m3e

1. 写个 m3e-run.py

```python
# -*- coding: utf-8 -*-
import os
import time
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import pymysql
import chromadb
from chromadb.utils import embedding_functions

# ==============================
# MySQL 配置
# ==============================
MYSQL_CONFIG = {
    "host": "localhost",
    "user": "root",
    "password": "root",
    "database": "after260518",
    "charset": "utf8mb4"
}

print("=" * 70)
print(" 开始使用 m3e 模型进行 AI 向量化导入 ")
print("=" * 70)

# ==============================
# 加载 m3e 模型
# ==============================
start_time = time.time()
print(f"\n[日志] 开始加载 m3e 中文语义模型...")
try:
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="moka-ai/m3e-base"
    )
    print(f"[日志] ✅ m3e 模型加载成功耗时{round(time.time() - start_time, 2)}s")
except Exception as e:
    print(f"[日志] ❌ 模型加载失败{e}")
    exit()

# ==============================
# 连接 Chroma
# ==============================
print("\n[日志] 正在连接 Chroma 向量库...")
client = chromadb.PersistentClient(path="./chroma_db")
try:
    client.delete_collection("zm_work_records")
    print("[日志] 已清空旧数据重新创建...")
except:
    pass
collection = client.get_or_create_collection(
    name="zm_work_records",
    embedding_function=embedding_func
)
print("[日志] ✅ Chroma 连接就绪")

# ==============================
# 连接 MySQL
# ==============================
print("\n[日志] 正在连接 MySQL...")
try:
    db = pymysql.connect(**MYSQL_CONFIG)
    cursor = db.cursor(pymysql.cursors.DictCursor)
    print("[日志] ✅ MySQL 连接成功")
except Exception as e:
    print(f"[日志] ❌ MySQL 连接失败{e}")
    exit()

# ==============================
# 读取数据联表查矿名
# ==============================
print("\n[日志] 正在读取数据表 zm_summary_all...")
cursor.execute("""
    SELECT m.mine_name as minename, a.id, a.rem, a.description, a.process, a.content
    FROM zm_summary_all a
    left join zm_mine m on m.mine_id = a.mine_id
""")
rows = cursor.fetchall()
total = len(rows)
print(f"[日志] ✅ 数据读取完成共 {total} 条记录")

# ==============================
# 开始导入
# ==============================
print("\n[日志] 开始 AI 向量化导入...")
success = 0
fail = 0
for index, row in enumerate(rows, 1):
    try:
        # 【修复】缩进统一空值安全处理
        minename = str(row.get("minename", ""))
        doc_id = str(row.get("id", ""))
        title = str(row.get("rem", ""))
        desc = str(row.get("description", ""))
        process = str(row.get("process", ""))
        content = str(row.get("content", ""))

        # 拼接向量化文本
        text = f"矿名{minename} 标题{title} 描述{desc}"
        metadata = {
            "minename": minename,
            "rem": title,
            "description": desc,
            "process": process,
            "content": content
        }
        collection.add(
            ids=[doc_id],
            documents=[text],
            metadatas=[metadata]
        )
        print(f"[日志] 第 {index}/{total} 条 | ID:{doc_id} 导入成功")
        success += 1
    except Exception as e:
        print(f"[日志] 第 {index} 条导入失败{str(e)}")
        fail += 1

# ==============================
# 最终汇总
# ==============================
print("\n" + "=" * 70)
print(f"[日志] 导入全部完成")
print(f"[日志] 总记录{total}")
print(f"[日志] 成功{success}")
print(f"[日志] 失败{fail}")
print(f"[日志] 总耗时{round(time.time() - start_time, 2)}s")
print("=" * 70)
db.close()
print("\n[日志] 数据库连接已关闭")
```

2. 运行试试 m3e-query.py

```python
# -*- coding: utf8 -*-
import os
import time
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import chromadb
from chromadb.utils import embedding_functions

print("=" * 70)
print(" m3e AI 语义检索系统带日志版")
print("=" * 70)

# ==============================
# 加载模型
# ==============================
start_time = time.time()
print("\n[日志] 加载 m3e 模型...")
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="moka-ai/m3e-base"
)
print(f"[日志] ✅ 模型加载完成耗时{round(time.time() - start_time, 2)}s")

# ==============================
# 连接向量库
# ==============================
print("\n[日志] 连接 Chroma...")
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_collection(
    name="zm_work_records",
    embedding_function=embedding_func
)
print("[日志] ✅ 连接成功可以开始查询")

# ==============================
# 你的问题
# ==============================
question = "视频"  # 这里改问题
print(f"\n[日志] 用户问题{question}")
print("[日志] 正在进行 AI 语义匹配...")

# ==============================
# 开始查询
# ==============================
query_start = time.time()
results = collection.query(
    query_texts=[question],
    n_results=3,
    include=["metadatas", "distances"]
)
print(f"[日志] 查询完成耗时{round(time.time() - query_start, 4)}s")

# ==============================
# 输出结果带相似度分数
# ==============================
print("\n" + "=" * 70)
print(" 匹配结果按相似度排序")
print("=" * 70)
metadatas = results["metadatas"][0]
distances = results["distances"][0]
for idx, (meta, dist) in enumerate(zip(metadatas, distances), 1):
    # 距离转相似度
    similarity = round(100 - (dist * 100), 2)
    print(f"\n【结果 {idx}】| 相似度{similarity}%")
    print("问题标题", meta["rem"])
    print("排查思路", meta["process"])
    print("解决办法", meta["content"])
print("\n[日志] 程序正常结束\n")
```

以上代码的问题关键字：视频

二、下载 m3e 并且调用本地离线版

1. 下载

① 下载 huggingface-cli

```
pip install -U huggingface_hub
```

配置 PATH：`C:\Users\Administrator\AppData\Local\Python\pythoncore-3.14-64\Scripts`（因为要用 huggingface_hub 命令）。

用 `hf --version` 查看版本：1.16.4

② 下载 m3e

```
# 最小版
huggingface-cli download moka-ai/m3e-small --local-dir ./m3e-small
# 标准版
huggingface-cli download moka-ai/m3e-base --local-dir ./m3e-base
# 最大版
huggingface-cli download moka-ai/m3e-large --local-dir ./m3e-large
```

提示：`Warning: huggingface-cli is deprecated and no longer works. Use hf instead.`

改成：

```
hf download moka-ai/m3e-base --local-dir ./m3e-base
```

2. 执行 py_run_local.py 进行表数据向量化

```python
# -*- coding: utf-8 -*-
import os
import time
# 不用联网直接本地加载
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

import pymysql
import chromadb
from chromadb.utils import embedding_functions

# ==============================
# MySQL 配置
# ==============================
MYSQL_CONFIG = {
    "host": "localhost",
    "user": "root",
    "password": "root",
    "database": "after260518",
    "charset": "utf8mb4"
}

print("=" * 70)
print(" 开始使用本地 m3e-large 模型进行 AI 向量化导入 ")
print("=" * 70)

# ==============================
# 加载本地 m3e-large 模型离线、秒加载
# ==============================
start_time = time.time()
print(f"\n[日志] 开始加载本地 m3e-large 模型...")
try:
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="./m3e-large",  # 本地最大版模型
        local_files_only=True  # 强制离线
    )
    print(f"[日志] ✅ m3e-large 模型加载成功耗时{round(time.time() - start_time, 2)}s")
except Exception as e:
    print(f"[日志] ❌ 模型加载失败{e}")
    exit()

# ==============================
# 连接 Chroma
# ==============================
print("\n[日志] 正在连接 Chroma 向量库...")
client = chromadb.PersistentClient(path="./chroma_db")
try:
    client.delete_collection("zm_work_records")
    print("[日志] 已清空旧数据重新创建...")
except:
    pass
collection = client.get_or_create_collection(
    name="zm_work_records",
    embedding_function=embedding_func
)
print("[日志] ✅ Chroma 连接就绪")

# ==============================
# 连接 MySQL
# ==============================
print("\n[日志] 正在连接 MySQL...")
try:
    db = pymysql.connect(**MYSQL_CONFIG)
    cursor = db.cursor(pymysql.cursors.DictCursor)
    print("[日志] ✅ MySQL 连接成功")
except Exception as e:
    print(f"[日志] ❌ MySQL 连接失败{e}")
    exit()

# ==============================
# 读取数据联表查矿名
# ==============================
print("\n[日志] 正在读取数据表 zm_summary_all...")
cursor.execute("""
    SELECT m.mine_name as minename, a.id, a.rem, a.description, a.process, a.content
    FROM zm_summary_all a
    left join zm_mine m on m.mine_id = a.mine_id
""")
rows = cursor.fetchall()
total = len(rows)
print(f"[日志] ✅ 数据读取完成共 {total} 条记录")

# ==============================
# 开始导入
# ==============================
print("\n[日志] 开始 AI 向量化导入...")
success = 0
fail = 0
for index, row in enumerate(rows, 1):
    try:
        minename = str(row.get("minename", ""))
        doc_id = str(row.get("id", ""))
        title = str(row.get("rem", ""))
        desc = str(row.get("description", ""))
        process = str(row.get("process", ""))
        content = str(row.get("content", ""))

        text = f"矿名{minename} 标题{title} 描述{desc}"
        metadata = {
            "minename": minename,
            "rem": title,
            "description": desc,
            "process": process,
            "content": content
        }
        collection.add(
            ids=[doc_id],
            documents=[text],
            metadatas=[metadata]
        )
        print(f"[日志] 第 {index}/{total} 条 | ID:{doc_id} 导入成功")
        success += 1
    except Exception as e:
        print(f"[日志] 第 {index} 条导入失败{str(e)}")
        fail += 1

# ==============================
# 最终汇总
# ==============================
print("\n" + "=" * 70)
print(f"[日志] 导入全部完成")
print(f"[日志] 总记录{total}")
print(f"[日志] 成功{success}")
print(f"[日志] 失败{fail}")
print(f"[日志] 总耗时{round(time.time() - start_time, 2)}s")
print("=" * 70)
db.close()
print("\n[日志] 数据库连接已关闭")
```

3. 执行问题查询

```python
# -*- coding: utf8 -*-
import os
import time
# 强制离线模式不联网、不下载
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

import chromadb
from chromadb.utils import embedding_functions

print("=" * 70)
print(" 本地离线 m3e-large AI 语义检索系统带日志")
print("=" * 70)

# ==============================
# 加载本地离线 m3e-large 模型
# ==============================
start_time = time.time()
print("\n[日志] 加载本地离线 m3e-large 模型...")
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="./m3e-large",  # 本地最大模型
    local_files_only=True  # 强制离线
)
print(f"[日志] ✅ 模型加载完成耗时{round(time.time() - start_time, 2)}s")

# ==============================
# 连接向量库
# ==============================
print("\n[日志] 连接 Chroma...")
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_collection(
    name="zm_work_records",
    embedding_function=embedding_func
)
print("[日志] ✅ 连接成功可以开始查询")

# ==============================
# 你的问题
# ==============================
question = "视频"  # 这里改问题
print(f"\n[日志] 用户问题{question}")
print("[日志] 正在进行 AI 语义匹配...")

# ==============================
# 开始查询
# ==============================
query_start = time.time()
results = collection.query(
    query_texts=[question],
    n_results=3,
    include=["metadatas", "distances"]
)
print(f"[日志] 查询完成耗时{round(time.time() - query_start, 4)}s")

# ==============================
# 输出结果带相似度分数
# ==============================
print("\n" + "=" * 70)
print(" 匹配结果按相似度排序")
print("=" * 70)
metadatas = results["metadatas"][0]
distances = results["distances"][0]
for idx, (meta, dist) in enumerate(zip(metadatas, distances), 1):
    similarity = round(100 - (dist * 100), 2)
    print(f"\n【结果 {idx}】| 相似度{similarity}%")
    print("问题标题", meta["rem"])
    print("排查思路", meta["process"])
    print("解决办法", meta["content"])
print("\n[日志] 程序正常结束\n")
```

OBS RTSP服务器插件终极指南：实现零延迟本地直播的完整解决方案

OBS RTSP服务器插件终极指南：实现零延迟本地直播的完整解决方案【免费下载链接】obs-rtspserver RTSP server plugin for obs-studio 项目地址: https://gitcode.com/gh_mirrors/ob/obs-rtspserver 还在为OBS直播无法被监控摄像头、智能电视等设备直接访问而…...

2026/5/29 22:03:26 阅读更多 →

Cadence Allegro Quickplace放不全元件？别急，可能是这个原点位置在捣鬼

Cadence Allegro Quickplace元件放置不全？绘图原点位置深度解析与实战排查指南作为一名长期使用Cadence Allegro进行PCB设计的工程师，我清楚地记得第一次遇到Quickplace无法完整放置元件时的困惑。那是一个周五的深夜，面对即将交付的项目……

2026/5/29 22:00:33 阅读更多 →

终极暗黑2存档编辑器完全指南：5分钟掌握d2s-editor的核心功能

终极暗黑2存档编辑器完全指南：5分钟掌握d2s-editor的核心功能【免费下载链接】d2s-editor 项目地址: https://gitcode.com/gh_mirrors/d2/d2s-editor 暗黑破坏神2存档编辑器（d2s-editor）是一款专为暗黑2单机玩家设计的开源Web工具……

2026/5/29 21:59:40 阅读更多 →