From 4e5bd2a5c731f2c731393d486afeec134559addc Mon Sep 17 00:00:00 2001 From: jesxion Date: Mon, 13 Apr 2026 11:34:36 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20VLM=20=E4=B8=BA=E9=98=BF?= =?UTF-8?q?=E9=87=8C=E4=BA=91=E7=99=BE=E7=82=BC=20Bailian?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 核心变更: - src/vlm/qwen_vl.py: 重写为 BailianVLMClient(阿里云百炼) - 支持 qwen-vl-latest 等模型 - analyze_chat_screenshot() 微信截图分析 - 使用 OpenAI 兼容 API 格式 - config/settings.py: 更新默认配置为阿里云百炼 - src/main.py: 添加 BailianLLMClient(阿里云百炼 LLM) - requirements.txt: 添加 openai 依赖 - README.md: 更新文档 API 配置: - 基础 URL: https://dashscope.aliyuncs.com/compatible-mode/v1 - VLM 模型: qwen-vl-latest(推荐) - LLM 模型: qwen-plus --- README.md | 118 +++++- config.example.yaml | 52 ++- requirements.txt | 7 +- src/__pycache__/main.cpython-311.pyc | Bin 0 -> 9986 bytes .../__pycache__/settings.cpython-311.pyc | Bin 0 -> 8910 bytes src/config/settings.py | 14 +- src/main.py | 29 +- src/vlm/qwen_vl.py | 341 ++++++++++++++---- 8 files changed, 456 insertions(+), 105 deletions(-) create mode 100644 src/__pycache__/main.cpython-311.pyc create mode 100644 src/config/__pycache__/settings.cpython-311.pyc diff --git a/README.md b/README.md index 9f5849b..2ef1af5 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,9 @@ ↓ UIAutomation + 屏幕截图 ↓ - Qwen-VL2 视觉理解 + 阿里云百炼 Qwen-VL 视觉理解 ↓ - LLM 推理判断 + LLM 推理判断(Qwen Plus) ↓ UIAutomation 执行操作 ↓ @@ -29,52 +29,86 @@ UIAutomation 执行操作 | 模块 | 说明 | |-----|------| -| `vlm` | 视觉模型接口(Qwen-VL2) | +| `vlm` | 视觉模型接口(阿里云百炼 Bailian) | | `wechat` | 微信客户端控制(UIAutomation) | | `core` | 核心引擎(消息捕获、回复判断) | | `agent` | AI Agent 逻辑 | -| `ui` | 桌面 UI 界面 | +| `ui` | 桌面 UI 界面(待实现) | | `config` | 配置管理 | ## 技术栈 - **语言**: Python 3.10+ -- **视觉模型**: Qwen-VL2(本地部署) +- **视觉模型**: 阿里云百炼 Qwen-VL 系列 +- **LLM**: 阿里云百炼 Qwen Plus 系列 - **Windows 控制**: UIAutomation (PyWinAuto) -- **LLM**: OpenAI 兼容 API -- **桌面 UI**: PyQt6 / Tkinter +- **API**: OpenAI 兼容格式(阿里云百炼) ## 快速开始 -### 环境要求 +### 1. 获取阿里云百炼 API Key -- Windows 10/11 -- Python 3.10+ -- 微信 Windows 客户端 3.8.x(推荐) -- Qwen-VL2 模型(本地部署) +1. 访问 [阿里云百炼控制台](https://bailian.console.aliyun.com/) +2. 开通大模型服务 +3. 创建 API Key: https://bailian.console.aliyun.com/cn-beijing#/APIKey -### 安装 +### 2. 安装依赖 ```bash pip install -r requirements.txt ``` -### 配置 +### 3. 配置 ```bash cp config.example.yaml config.yaml # 编辑 config.yaml 填入 API 配置 ``` -### 运行 +或在环境变量中设置: +```bash +export ALIBABA_CLOUD_API_KEY=your-api-key +# 或 +export DASHSCOPE_API_KEY=your-api-key +``` + +### 4. 运行 ```bash +# 模拟模式(不连接微信,用于测试) +python src/main.py --mock --demo + +# 实际运行(需要微信客户端运行) python src/main.py ``` +## 阿里云百炼模型 + +### VLM 视觉模型(用于截图识别) + +| 模型 | 说明 | 推荐场景 | +|-----|------|---------| +| `qwen-vl-latest` | 最新 VL 模型 | **推荐**,微信截图识别 | +| `qwen-vl2-7b` | Qwen-VL2 7B | 轻量级场景 | +| `qwen-vl2-72b` | Qwen-VL2 72B | 高精度场景 | +| `qwen2-vl-72b-instruct` | Qwen2-VL 72B | 最新一代 | + +### LLM 模型(用于生成回复) + +| 模型 | 说明 | 推荐场景 | +|-----|------|---------| +| `qwen-plus` | Qwen Plus | **推荐**,日常对话 | +| `qwen-max` | Qwen Max | 高质量回复 | +| `qwen-turbo` | Qwen Turbo | 快速响应 | + +更多模型: https://bailian.console.aliyun.com/cn-beijing#/model-market + ## MVP 功能 ### Phase 1(本期) +- [x] 项目初始化 +- [x] 阿里云百炼 VLM 集成 +- [x] 阿里云百炼 LLM 集成 - [ ] 微信窗口识别 - [ ] 聊天记录截图识别 - [ ] 用户信息识别 @@ -82,14 +116,66 @@ python src/main.py - [ ] 定时轮询机制 ### Phase 2(后续) -- [ ] 知识库集成 +- [ ] 知识库集成(OpenViking) - [ ] 多账号管理 - [ ] 复杂对话上下文 +## 配置示例 + +```yaml +vlm: + model_type: bailian + api_base: https://dashscope.aliyuncs.com/compatible-mode/v1 + api_key: "" # 从环境变量读取 + model_name: qwen-vl-latest + max_tokens: 2048 + temperature: 0.7 + +llm: + api_base: https://dashscope.aliyuncs.com/compatible-mode/v1 + api_key: "" + model_name: qwen-plus + max_tokens: 2048 + temperature: 0.7 + +wechat: + client_version: "3.8.x" + poll_interval: 2.0 + window_title: "微信" + +rules: + - keywords: ["你好", "hi"] + reply_type: keyword + reply_content: "您好,有什么可以帮您的?" + enabled: true +``` + +## API 参考 + +### 阿里云百炼 API + +- **基础 URL**: `https://dashscope.aliyuncs.com/compatible-mode/v1` +- **认证**: `Authorization: Bearer {API_KEY}` +- **格式**: OpenAI Chat Completions 兼容 + +### VLM 核心方法 + +```python +from src.vlm.qwen_vl import BailianVLMClient, analyze_wechat_screenshot + +# 方式1: 直接分析微信截图 +result = analyze_wechat_screenshot("wechat.png") + +# 方式2: 使用客户端 +client = BailianVLMClient(api_key="your-key") +result = client.analyze_chat_screenshot("wechat.png") +``` + ## 参考项目 - [thiflow-research](http://192.168.5.5:3000/jesxion/thiflow-research) - Thiflow 产品研究 - [thiflow.com](https://thiflow.com/) - 参考产品 +- [阿里云百炼](https://bailian.console.aliyun.com/) - VLM & LLM 提供商 ## License diff --git a/config.example.yaml b/config.example.yaml index 142338d..585269d 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -1,17 +1,36 @@ # WeChat Agent 配置文件示例 +# ============================================ +# VLM 视觉模型配置(阿里云百炼) +# ============================================ +# 阿里云百炼平台: https://bailian.console.aliyun.com/ +# API Key 获取: https://bailian.console.aliyun.com/cn-beijing#/APIKey +# +# 支持的模型: +# - qwen-vl-latest (推荐,VL 理解) +# - qwen-vl2-7b +# - qwen-vl2-72b +# - qwen2-vl-72b-instruct +# - qwen2.5-vl-72b-instruct +# - qwen-omni-series (全模态) +# ============================================ vlm: - model_type: qwen-vl2 # qwen-vl2 / gpt-4v - api_base: http://localhost:8000/v1 # VLM API 地址 - api_key: "" # VLM API Key(如果需要) - model_name: Qwen-VL2 # 模型名称 + model_type: bailian # bailian / qwen-vl / gpt-4v + # 阿里云百炼 API (OpenAI 兼容格式) + api_base: https://dashscope.aliyuncs.com/compatible-mode/v1 + # API Key: 设置环境变量 ALIBABA_CLOUD_API_KEY 或 DASHSCOPE_API_KEY + # 或直接在这里填写: + # api_key: your-api-key-here + api_key: "" + model_name: qwen-vl-latest # 模型名称 max_tokens: 2048 temperature: 0.7 llm: - api_base: https://api.openai.com/v1 # LLM API 地址 - api_key: your-api-key-here # OpenAI API Key - model_name: gpt-4o # 模型名称 + api_base: https://dashscope.aliyuncs.com/compatible-mode/v1 + # LLM API Key (同上,可以使用相同的 API Key) + api_key: "" + model_name: qwen-plus # 或 qwen-max, qwen-turbo 等 max_tokens: 2048 temperature: 0.7 @@ -32,15 +51,30 @@ rules: reply_content: "您好,有什么可以帮您的?" enabled: true - # AI 回复示例(无匹配关键词时) + # AI 回复示例(无匹配关键词时,使用 LLM 生成回复) - keywords: [] reply_type: AI reply_content: "" enabled: true -# 知识库(可选,后续接入) +# 知识库(可选,后续接入 OpenViking) knowledge_base: url: http://192.168.5.5:1933 # 日志级别 log_level: INFO + +# ============================================ +# 环境变量 +# ============================================ +# 推荐将敏感信息放在环境变量中: +# +# Linux/macOS: +# export ALIBABA_CLOUD_API_KEY=your-api-key +# export DASHSCOPE_API_KEY=your-api-key +# +# Windows: +# set ALIBABA_CLOUD_API_KEY=your-api-key +# +# 或使用 .env 文件 (需要 python-dotenv): +# ALIBABA_CLOUD_API_KEY=your-api-key diff --git a/requirements.txt b/requirements.txt index 47269ff..ffc3ec6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,14 +8,13 @@ pywin32>=305 requests>=2.28.0 urllib3>=1.26.0 +# 阿里云百炼 / OpenAI 兼容 API +openai>=1.0.0 + # 数据处理 pyyaml>=6.0 pillow>=9.0.0 -# 异步(可选) -# asyncio -# aiohttp>=3.8.0 - # 日志 # coloredlogs>=15.0 # 可选 diff --git a/src/__pycache__/main.cpython-311.pyc b/src/__pycache__/main.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea78181c88229ee0eefcc141db776f7428dbb89b GIT binary patch literal 9986 zcmb6b!)n%z>j)VeIm_XQT1OAs47#z3$kF&F|dHVK5xV3O4;>c+^(lIU&)FmhtD zWDyPtPL>2RaVAO@7|-AY=GerOaO}*Ft<=u`=%#uLRo7BQ)gohlji@kHRQ~LK@3nRL zAd}ZmU%!6udq;ooJ74QpHk+A%RK)&||6wUX{4aV)CuXYfy@Mo(vjj_Y6D-N<0%SLd z@49XszU#a7_)c|GaMuUuK0~*GBsg8KF+Ef~JyyB`-&s;ea3=T}IP*oA;}ZO|)N7ep z%E!2<1AO>ti9{=-0@glTw+)x+KtZ3q+m6eIfTPdZ?Zjncps=r~yQr_YyBMBrtcf%2 z)3N5`bax4BIZkw!a%HUb03MgO!pcf7!UUJn(-6={w+sK=1(9;7Hvf#&=u!8XBjPm@dFQO2kp$87ZG6`5VEKW4inrAenv_;ie*17-;g#>-VmGIdS$v z^2XWzRg0H-1)u2egZSIK$q;{*W+=N8;L9&xN{*camaykeJ|QjLL7 zPme#?qgr0|3VxqPXw}%miC$6URXP$5BZ}jG0(xuL4@+0>7C2s5*~L!1wG zu?}VkiYo=)x3bUc57vi|s5XHUBVo_f416K;VCiC{x5A+Ao3MG%V!zIK3y{?*yKY>BOo(zET_9^znT%o{4y5zi7aXRHwH;{4+bTa(Xg0|p%tHAMCK>l-kFoXk(p&o7$u zy|$HC9jLt^{nWcKT^d`EKjr`vW#T!UpT?U$zECzN^%mxj z2qp~ZHRn6kPRGW3wKY5MkL=Z=pT1XOM|Sim6Q!cYsD5m5zJ?z##hm*{cFA#Oz>G^v zp=80OWl*w4P4Ok!QL-i09X03gIq0RiaF<6de>d)msC5P|*_)A4{?eQjequU>fmiDS z%lg~{$lF7FyL_Mk_|g_Fh%e8+Nw$=4JJ@=5CELI@vQM&2BUBH`t^%FfoNwm_>`_zi ziu4HaC(@!7 zb>`mlsYeo~49!ghP<&-BZo14qHGT9`rJ%w7G2M#`T0cb>wA|jw{4p|@tfq(X!6vD z;H_I&PVn*mum~<>KT}ud!!CZmuFltQ(aQBlynTTPGI;mjdIx#FAD+1T?N^guUrt^> z4z9hC<@R|a0a2y)bAfQbrLGPf%{uJ+!YBbAc{t$U((Ri!?tT_Osm7nR?d;mt^{h&7 z-nnzfPV7}{{`9?XewB<}a_f0y$yB3nf5`9S1RhzienVYd9|R7nSr9`!=Mnh`7eRt@ z_u7qnXGSMRKbv^_(}{PX|5wS6M*n&>c6a=p$s50%c=v+#{(rrC5>{*Qg0mh9W=3SW zzK}Nm)id|be*>c=ubz$|uFrj07%KV2x#S28dny_GIQi{2fP44)+Zs-Nef^7w5MeJ5 zuD75v++n{UdO`;xXx+J($*+H>bzkR-EP}Sl%U|ES`r+jD_i`so4!?W<#5)roUYWcx zJo(wF2r~LwdvYi~dF|RnY&bde={lF&pql$aEEn*IN5Y)Sc*A}V1P`2QM5P1Vk#sW( zL7?nC>=8o;xS*g~M6M6Q39lI8xd^iOFKzF1W%xr|p*BmTYWCLT`QJ|d;ler>kNkBX z{4y!+z<<9Csy@)2B{%{?Z32ba-TD{_%3ODx>84@?l=M)dSqdU6`iJ>AT4@c0eBOYt zp?;&+AMksF+qZXa!f|C}9t?N?+>QGuPTl?Hy?Y;BzxU=<7qn)V?Mu&$mSTV(a)7^o z2;c+qEDDatSb+)(d+PZ7g zj^{ULstAZk7s8vU3}PTpgb%n)szDQf)vzPX1>3f%`aqyh!_tKW)yM^3^Yfvgs)rbr zM=lL?(2fA~DkX4%eLRvP)ewT-UcZ2->2eM7PCSjr;}7~pkEeg>C4$0k==bwpFd(&%9p1IqM}Xr#j9hzmxI3vO6y;g>vt>lyS3{7-m_QP zX(@X!k!7ga4dvY8=xo36Grh=MWmwDL~Eim0(s# z%!(96n+j6IETF(@0%|-CU^T&SgJB~`AhKZ0)JW0l(G&8YCr)3QA$8Z)6&l$tZ+j5_B zwNntz$3e4D`36>t5WwI?&|%ZWnXi(uD-&PECXQWp?d{jUvd(q?{~-12yH>Wj!8G}U z`$DP-i4^8DEUMtWF+$KA7WRik%tx4o($>lx3VNnQzrQ?RRI@Enm|NHa02wHz;Iw7P zA{7hqpDT;kz#Zk?uPD3sNv#LRw<@ij-@U4|z92yk9Mqa+MpPJ4N@vaDWM~BAW?Y&$ zJYbh5$KLSEfY!O#fTJZ$>kMsKp!Hk2tiA`@>pz)x4l)&Sk+a(?(C#_*XjVP1xgU5p zk{|yvdH$WW$9f-WfPOvr%Q=l1=A%`k5b=S#FYwO;9*+)HRO2BpA4KQk2GGz#D2QBm zP^%Dc%-kmic7x}_fg}B;Gj<8rzX2l($O}Y!WO1I_0LR&;^$Ba?>5icesl08pbZq{( ze!Os8uee*`S!%wV)-^S30VQS-xF8Gm7;_MXKF+8{zu>`6DRKqev}!`N@Oph={spu= zY_RPR33Cu2ffn$D;ggqY-0u}ULGBP{*L!C^MU0F&8saxQbc49?{>b3u#ba7+1Vycy zGIr(0;K1J z2uyREolEEmRGfvEh+5#!gj`^NFf;XC#PrXESoBc;x1)I*2nU z6soIs_^?&)gZRIm3&Qs@jz{9mdl8Z;%!iQn2my&;3nKwm(*|k*d=W#@E;sdz1cDbq zE&?1ER-GN(kyk@rp4|pM9nVL?aFPKh3}MyG`UNn9Xie@SOdmvj3Q-z)kLXJ%<`T_o z+>X|<2?1pPdE{I0C|*%%{6UV-mxS!H`JEUE>!%zS1}O{VCg!DQp3%Z53JEG71C(m` z37^AuJOY(=+g7637DqC8|uO_A1m~ ziQ1cOwZ+9NFC}W)C7cu0%OMh?%O4Or`iX}KfEx(FK+u+{;h52CsdYad%&$;>iSlO$ zgSPQ5+|;2^9TL@X$Kt$YslI8cmM!xY%lyIizrjd_!;8+;%hUpeS|CviZrh3#+x$-s z%eG~TZQ0-!z$hvk7S9}&3u~3a+QE*5rSz6%-c8Fq*;1oeYEp!Owr33OCYfnhn0AS2 zPcTIaQ+bPV-DF&&#p9I3xMXI%!mO8=^|zT4g;{u^DSlM0ZBlBRWM-AZtdf{jnQ>Z1 z4@kwJ&{|}sRbg5srZv;FPG(vZrbS{}pekla+A0-W%>{kDLv}TQ2QAwg6ZW=9Ggl*)=y~rOY%aOoN2;quZ8Zv|IKpK<{83vPvbZuQURH zEHi5qW{t$G!S50?XxEgtPTP^!ZiXEi>_k_JDy6~m{Za4$p*+muH4udhCFg?p64~ii zobJKrfPWb4@gv|w(?uD&u92Cw3bR&X*4}n5mYqu#=Tfwq6vdc|ZkH^+vic9}m8MRq zq)RU8QcAjFTM|VTXE(gRK`yFQifU1KQkIEKel9!qD2_dnV-Hjf8#0!?S+=fLtg9uQ zAKkWBK{r!T79hm!L))c_)^R5Q$g*|4VqGs;k#`W=lD?)0WGaHf-^_%q9JVY`G!JEa zNyL=0+q_Gh|5T%#} zU3ZFVhFcS*m1q6$``dJYbQFHnN;HERymIUPD;E6?}CfY^*{mLGjc zz_ts!BflmzM-d$h{=l?q@%{$>U*HjWOhPq4*c6gX>2xFo5(|JKNb{fxT?xWGh_3`u zFo>@NVIRa-f+&~rGw=sV9cND1-?R^t!%c9B!{WKE@4bHJb*1bHrL0LVTBQ`Nl8I)8 zXqJfPJ4Bt7pA-3y(3&8gl=5?eSTD_%QwExZzs~?T2Z|?M&M7%|=}6$X92lSo${Opx dsjraq6$y$NY(KW;_?AI@rSt?<3DuZ-{|_eo*=Ya( literal 0 HcmV?d00001 diff --git a/src/config/__pycache__/settings.cpython-311.pyc b/src/config/__pycache__/settings.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d07a0651db6ccf66488b03f72eadf1713cc3ddb6 GIT binary patch literal 8910 zcmcIJZEzDumNU}m`@>)Im;7ymV?k_VjEU_8SUwE_OcKcEqD@XEG-F$2EIBhH3_kl- zyG3@DjdKxuWN}z0IJw>66glGiakaT!NOtq#pl#G1w3D)hcZ@oNPMYEv6E#4w)-x1q`_M#D zm*KCc58-ursXzr?IZ8WFIvz>s&QUsn()CD6PmaG5 z>4L?q=_nN_O>BWlIlq4G&D+X<`OQPS*3#;|di56~L(OjsI; z%i;2ge_dAzMQk_VzoU-TwL2Z?3+s*^Y;!d^8->oUjBOACgXwao=@~NYa?t-`UBA z#St+QALBa0eDri877;rl@zGBBkHHci=ec%l)p@c@vxmo`q2pnZ(x_cdjwD5waZ3Q6%-xR`i*h@@LPG?&DUj&Z=Fguo33yqY}} z!WN;B<_v{&MnT>k3cZ>L^Er$y6k_9%P)KmW+%>Z(30hGo6pqE>65+K7C=Z~7A_T<< zN(p5&CJ8>wS%&y{So(;%2L5`c)wW1;UblNO44e%&TgX$Yd?oUNzW&j?A8Boj0 zlLM)vGE)wZ!j!8OtvY?H%(N;@t6JgL%YK>hD~w;Qu1y|H192@p3W1Oj>X(MHwr~ak z{eKirCkWt@aJo46vT%YpdNkLe0BuR*!-5n?E~alnK@Ud$nvCLBb1#`uJ&GpqzL z$Tgu10dj3H;1SBP04kIcDiKs6AU#!M&X2&zbfE_8u(?o+pbi09F&wGTv;?px5%;l? zjRBbnd_UWobT&1~Oq0Si{cqTY-(Q<;hXC^f*k%W2Ea?;nhovQ*;w;oDz$lmIeSzBz zf}K|#xR{HdUAg)B`-@jTxRv_D?aQC&Z2QjA-O<}|O7lkeDC~jIN!T5*F>V7rrXOjZ zu{h6%U^{Wb$uOU*&+gF5#E8Jb{vL@-dF;1VXrg~cbB{-3YbA@bc@V1!lMuf8C`bpTWhn82h2*gnZPP@mXA#zf`z{fe?0}Q zoQ<>NJoUsm@}|NGjCADL>LR5jwxT(3EbT_ZR>yPP7=QXXFlhng3fKcN2Lf(M&7)U<-jjg;$21!k z3xfe*K|`;}G6owlJeW*j6J!N+Y6@7m8H?Qr&{k{A@pznH$-{O;BR~d=Q2<6BR>(|+ z>Tj4?zre`;9>w32@%Ml>Wz*f$QJDcg_9#rxDveuR15ECgnHqQ$rbhKufyO;1Gga^? z1VTpVDSmrxp2~pv!92BaHl2@z>-JS#$4zQu88qjF!numV#W{h3Y-&Q`UPa;Iyj)R% zNWrz3J>%{fR!XkMFN{FkJ@`-Kh|n`S=Ak7{Nf zfK|7LnneH+69=`@S7PyT5Qbrn_yD1Vz-!Eb!F@+G2Ol2}@!Uy{5BMG`C%WnEgKy~i znyBD@EaI*boYsgS#Z8|ak6S=Za zsqD*C_9b_xKsfr~QJ6mUk;uB{ndcX_%XPbyx?P#NUCDi^L7CYFkHYLy337{E*Q?a^ zX6kyAdsAD|j_DmT(+iKn^r~%-g9Ojm=gMY1G6VU?73OiZye4@dCFbQ*&*<`q^DxLG z&K=Xes{nqvN+QwWgg>~PoUK3t?7B3js3C|N&Y-L5WIe2fbFn}-#IUwAmKBkK8z}6Y zCx>^OVOHS1fXB8b*0~B-1UMJx0-Splt~f=VqJt$MdtS|>%idw)lnV*8*}CxQ9{l1v z^Mjf#r2AcwyvhVBbMAi+746&fd1{h60XaP-TP4Kjl5yB``9Nq1kMBv!sFWwEP##^_ z^HJ&}dN5$t9AMT&DIAM%nng^2;|ik;#klcM2x10vgcpG#L_&#BNW?ETCI#YX3_yOO z+%W5oT+6+H5;p*NkfcGw@A6AP>r>A7Q z?(=CbeSJdUo{wl&fdd~p7Ac%2c->^d?%jbUG%nT8Y{uzL(#dC~g0?5}`MH86h5mAh zHA(0K=aPh;_);GANiG=|_k?Y!eytp9o`7DAEsPQ06m1wG&W@zHf{`rM0c&YvU&4zL zIy|{F3A=%{d~E#?W_LM1!yanNB%x=SODFPknOtGLr_2zA!_PQEnu@tvs4hxm#!aWp zFH++)O}$9b@SCLOJWIV6z9-GbQ5FhdtV}`$mrF)*E~GPZGGNo)fM3nsxpqz8RD)XC z3ww9(-+3&wb71e_vC!@#gZmB)OyuOJ<8*kG*J$B8OF|lI?**U1&cz~e7Q)Lwk+2Is zubJZDEpV}uQ6V1F%)^|d>tN(3Mr=b-9wRpESXcmeL1T1*D1B|dAdK8|GtvWk+ZZP8u zWPF=4z9&9yS9}9X@%~Kl{<{`)jrTqUAa(FtC*}4|Kk?S~Uu~b+sJL1)uGTDNakN}7 zDNnyL*S%0Fm-HzmeVLNJEJb?)pFVrNvgX|1&xMuBwoGMPn#r1>mY@i@i$wPkEZ41- zd+^}FTGcoFbzk{6e(B#hzen+RW&B;B7Y?QlF8UkaX`5}s!VORoe<$1hB8E?2l$Gz= zV*X+av#Zzk#gjBZ;cubZpeCG9fm`6-uK}K?qyl|w6uK!mikhTPQo=#lKJ>DYnJZ`k z{p6Cd?H=moO)r5`nKDl?($b+9P)?do&+`dA5;s(btYc(Q&=SN7YVq!ut=gVa?FB0@v)DX0)pj;%xiS%H8c41 z0`ai7Ln}CUL1$F6MB%Uw48cfP3`>$gKCju}RE(6C$3finWB7F(k!B5#fdya%;=p64 z0h_MpQ4wiY-HFl65R7UTI88`!ngb(^5NdyoAwdTk8ngkBba(0{sF0HU<&Y}L7{l=X(IIK3yeEwxRpZ=UO!edVkA&M)gbmHMtseOHz;dA1O6)|ob^ zpTAz_pV>OwKPM_>8#84aK^K>7x$di(**E+2ym|h4#n+wjb<4i)2meo zwRbl5RsH5K>o+U)TQc=zm`~tXpC}ybw)qyNtRqv_fn$9_AM4@y?gfkD+nVuhm3>=b ztfgr-wIj7dt!tWlZ1&|WW%FzxAnmwb*{IgFUVQn&%k%vUf>PI?sq0r8+AdCAn3{j- z(>|qPZ>C|d+Pv{n`v>g{MSm_;n)hUy_o%h&FCM;dcs{UjOsVb9)b?i`ma5Xb6oC5( z!2Y?ZlFD=LuZml~EN+?GqZ9`+#Q~W-H(WI{Fv}KrFg|MEmTBAmRokvF+jjlAM`=5d zX*-bGmvOJZQPXg-_d@U7!G#K?rZ-d5o3du?RX2*tQyvm486pI9T!}T?2N|Q&oAiMj z_5*CyQA1P#JCN3Q;6MCf@vYx1Ui-(z*Wc9lVZcQEBLSr?KohAJvHzU2;sj4A=ip3K z8HIHja|YN^|f81cy$#8Ymh(DwL!`h?N+HN#r z0!@S)H$Q)O@tl({X z_Gj^~;>U+J3*?*`kC+2ia-Fc&#Q`}!o~T~BCxHu(4)kfmQF1?WvTMS9I-U^Pp}rk2 zLaf7MQu~&8!1B<(gmv&ip%pn*Yn>)I;_IC7i^z?W2b$1Iy+gsPPE61TBF$o$yE*!zaGXVBM*y{kq08Er2a41uOtB)zqRk zx2o$OGu9V*D!kcxs-!}$+>|w&8oUsh_#3lF=(aLvmI9EneS3_iyrt=NGv>Kr+0&tT zIx-&A-;QVKg{JH7;^_nF;h9$#n13&q-A5Gnk&OEYWFVvf0-a2U5LG}8gcU353u#v8 z^%&!On~1neKC9lj&it2k%+5C3UpCSJg*{LkC|&cQZ!e&9)rjIt+z0)%ZvcvLaDkVt zqntDnXZ1)D6(|TsG$ILT+JzW`Q&{QAou@=V?BSG!77joyQL;bMU4flQiU3^;u?z&@ zLE)*N%2dlt^^Ia*@_^d74o-jnCOwq#)a8%)vy9nNm!;MKPRd%6HI-UwvJ`+Bk-X-I z$ZHwgH3Qk1pXh{bq)D@-V$PSP;584CBD|JCo*HZS(}bLX&bMYMcrAlG9oD)DS-a&K zdd{4s@LdS=q$?4HM7rg`RSJ~ow`R}bvXTAh%3ptTqJVZpU;t=VxE+Qg0R03Oxv1GN zuIHo21$4*sW85apwICo)AG(Qz7b{L?zq3E3-{o!>ehMVOBTD=PAjmOIXH90B0S*9g zQnWMaz@JLFlH{jSZSq==N;S!AJt|ct8z0zoCRf&Eqk+w VLMResponse: - """发送对话请求""" - pass + 阿里云百炼平台的视觉模型客户端,支持 qwen-vl 系列。 + 使用 OpenAI Chat Completions 兼容格式。 - @abstractmethod - def analyze_image(self, image_path: str, prompt: str, **kwargs) -> str: - """分析图片""" - pass - - -class QwenVL2Client(BaseVLM): - """Qwen-VL2 客户端""" + 使用方式: + client = BailianVLMClient(api_key="your-api-key") + + # 方式1: 文本对话 + result = client.chat([VLMMessages(role="user", content="你好")]) + + # 方式2: 图文对话 + result = client.analyze_image( + image_path="screenshot.png", + prompt="描述这张图片的内容" + ) + + # 方式3: 分析微信截图 + chat_info = client.analyze_chat_screenshot("wechat.png") + """ + # 系统提示词 - 微信 UI 识别专家 SYSTEM_PROMPT = """你是一个专业的 Windows 微信客户端 UI 识别助手。 你的任务是根据截图准确识别微信界面中的元素。 @@ -63,15 +80,42 @@ class QwenVL2Client(BaseVLM): def __init__( self, - api_base: str = "http://localhost:8000/v1", - api_key: str = "", - model_name: str = "Qwen-VL2", + api_key: str = None, + model_name: str = "qwen-vl-latest", + base_url: str = DASHSCOPE_BASE_URL, max_tokens: int = 2048, temperature: float = 0.7 ): - self.api_base = api_base.rstrip("/") + """ + 初始化阿里云百炼 VLM 客户端 + + Args: + api_key: 阿里云百炼 API Key + 可从环境变量 ALIBABA_CLOUD_API_KEY 或 DASHSCOPE_API_KEY 获取 + 或从 https://bailian.console.aliyun.com/ 获取 + model_name: 模型名称 + - qwen-vl-latest (推荐,VL 理解) + - qwen-vl2-7b + - qwen-vl2-72b + - qwen2-vl-72b-instruct + - qwen2.5-vl-72b-instruct + - qwen-omni-series (全模态) + base_url: API 基础地址(OpenAI 兼容格式) + max_tokens: 最大生成 token 数 + temperature: 生成温度 + """ + import os + + # 获取 API Key + if not api_key: + api_key = os.environ.get("ALIBABA_CLOUD_API_KEY") or os.environ.get("DASHSCOPE_API_KEY", "") + + if not api_key: + logger.warning("未提供阿里云百炼 API Key,请设置 ALIBABA_CLOUD_API_KEY 环境变量") + self.api_key = api_key self.model_name = model_name + self.base_url = base_url.rstrip("/") self.max_tokens = max_tokens self.temperature = temperature @@ -80,10 +124,57 @@ class QwenVL2Client(BaseVLM): with open(image_path, "rb") as f: return base64.b64encode(f.read()).decode("utf-8") - def chat(self, messages: List[VLMMessages], **kwargs) -> VLMResponse: - """发送对话请求""" + def _get_image_media_type(self, image_path: str) -> str: + """根据文件扩展名获取 media type""" + ext = Path(image_path).suffix.lower() + mime_types = { + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/bmp", + } + return mime_types.get(ext, "image/jpeg") + + def chat( + self, + messages: List[VLMMessages], + system_prompt: str = None, + **kwargs + ) -> VLMResponse: + """发送对话请求 + + Args: + messages: 消息列表 + system_prompt: 系统提示词(可选) + **kwargs: 其他参数(max_tokens, temperature 等) + + Returns: + VLMResponse 对象,包含 text 属性(解析后的文本)和 raw 属性(原始响应) + + 示例: + # 纯文本对话 + client.chat([VLMMessages(role="user", content="你好")]) + + # 图文对话 + client.chat([ + VLMMessages(role="user", content=[ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}, + {"type": "text", "text": "描述图片内容"} + ]) + ]) + """ # 构造消息格式 formatted_messages = [] + + # 添加系统提示 + if system_prompt: + formatted_messages.append({ + "role": "system", + "content": system_prompt + }) + for msg in messages: if isinstance(msg.content, str): formatted_messages.append({ @@ -97,13 +188,14 @@ class QwenVL2Client(BaseVLM): "content": msg.content }) - # 添加系统提示 - if not any(m.role == "system" for m in messages): + # 如果没有系统提示,使用默认的 + if not system_prompt and not any(m.get("role") == "system" for m in formatted_messages): formatted_messages.insert(0, { "role": "system", "content": self.SYSTEM_PROMPT }) + # 构建请求 payload = { "model": self.model_name, "messages": formatted_messages, @@ -112,14 +204,13 @@ class QwenVL2Client(BaseVLM): } headers = { - "Content-Type": "application/json" + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}", } - if self.api_key: - headers["Authorization"] = f"Bearer {self.api_key}" try: resp = requests.post( - f"{self.api_base}/chat/completions", + f"{self.base_url}/chat/completions", headers=headers, json=payload, timeout=60 @@ -127,6 +218,8 @@ class QwenVL2Client(BaseVLM): resp.raise_for_status() data = resp.json() + logger.debug(f"VLM API 响应: {json.dumps(data, ensure_ascii=False)[:500]}") + return VLMResponse( text=data["choices"][0]["message"]["content"], raw=data @@ -136,18 +229,35 @@ class QwenVL2Client(BaseVLM): raise VLMError(f"VLM 请求失败: {e}") def analyze_image(self, image_path: str, prompt: str, **kwargs) -> str: - """分析图片""" + """分析单张图片 + + Args: + image_path: 图片路径(本地路径或 URL) + prompt: 分析提示词 + **kwargs: 其他参数 + + Returns: + 分析结果文本 + + 示例: + result = client.analyze_image( + "screenshot.png", + "识别图片中的所有文字内容" + ) + """ if not Path(image_path).exists(): raise VLMError(f"图片不存在: {image_path}") - # 构造多模态消息 + # 编码图片为 base64 image_data = self._encode_image(image_path) + media_type = self._get_image_media_type(image_path) + # 构造多模态消息 content = [ { "type": "image_url", "image_url": { - "url": f"data:image/jpeg;base64,{image_data}" + "url": f"data:{media_type};base64,{image_data}" } }, { @@ -162,28 +272,43 @@ class QwenVL2Client(BaseVLM): return response.text def analyze_chat_screenshot(self, screenshot_path: str) -> Dict[str, Any]: - """分析聊天窗口截图 + """分析微信聊天窗口截图 + + 这是最常用的功能 - 分析微信截图,提取聊天信息。 + + Args: + screenshot_path: 截图文件路径 Returns: - 解析后的聊天信息,包含: - - messages: 消息列表 - - current_chat: 当前聊天对象 + 解析后的聊天信息字典,包含: + - current_chat: 当前聊天对象名称 - has_new_message: 是否有新消息 + - messages: 消息列表,每条消息包含: + - sender: 发送者 + - content: 消息内容 + - time: 时间 + - is_self: 是否是自己发送的 + + 示例: + result = client.analyze_chat_screenshot("wechat_chat.png") + if result.get("has_new_message"): + for msg in result["messages"]: + print(f"{msg['sender']}: {msg['content']}") """ - prompt = """请分析这个微信聊天截图,返回 JSON 格式: + prompt = """请分析这个微信聊天截图,返回严格的 JSON 格式,不要包含其他内容: { - "current_chat": "当前聊天对象名称", - "has_new_message": true/false, + "current_chat": "当前聊天对象名称(如果是群聊,返回群名)", + "has_new_message": true或false(根据是否有未读标记判断), "messages": [ { - "sender": "发送者", - "content": "消息内容", - "time": "时间", - "is_self": true/false + "sender": "发送者昵称", + "content": "消息内容(图片用[图片]表示,语音用[语音]表示,视频用[视频]表示,文件用[文件]表示)", + "time": "时间字符串,如10:30", + "is_self": true或false(是否是自己发送的消息) } ] } -只返回 JSON,不要其他内容。""" +只返回 JSON,不要其他文字。""" result = self.analyze_image(screenshot_path, prompt) @@ -194,25 +319,33 @@ class QwenVL2Client(BaseVLM): end = result.rfind("}") + 1 if start >= 0 and end > start: json_str = result[start:end] - return json.loads(json_str) + parsed = json.loads(json_str) + logger.info(f"聊天截图解析成功: {parsed.get('current_chat', 'unknown')}, {len(parsed.get('messages', []))} 条消息") + return parsed else: + logger.warning(f"无法从响应中提取 JSON: {result[:200]}") return {"raw": result} - except json.JSONDecodeError: + except json.JSONDecodeError as e: + logger.warning(f"JSON 解析失败: {e}, 原始响应: {result[:200]}") return {"raw": result} def detect_ui_elements(self, screenshot_path: str) -> Dict[str, Any]: - """检测 UI 元素位置 + """检测 UI 元素位置和类型 + + Args: + screenshot_path: 截图文件路径 Returns: - UI 元素字典,包含类型和位置 + UI 元素字典,包含 elements 列表 """ - prompt = """请分析这个微信界面截图,标注关键 UI 元素的位置: + prompt = """请分析这个微信界面截图,标注关键 UI 元素的位置和类型,返回 JSON 格式: { "elements": [ { - "type": "button/input/chat_list/...", - "name": "元素名称", - "bounds": {"x": 0, "y": 0, "width": 100, "height": 50} + "type": "元素类型(button/input/chat_list/message_area/sidebar/title_bar等)", + "name": "元素名称或描述", + "bounds": {"x": 0, "y": 0, "width": 100, "height": 50}, + "clickable": true或false } ] } @@ -228,6 +361,55 @@ class QwenVL2Client(BaseVLM): return {"raw": result} except json.JSONDecodeError: return {"raw": result} + + def recognize_text(self, screenshot_path: str) -> str: + """识别图片中的所有文字 + + Args: + screenshot_path: 截图文件路径 + + Returns: + 识别出的所有文字内容 + """ + prompt = """请识别图片中的所有文字内容,按原顺序输出,保持格式。""" + return self.analyze_image(screenshot_path, prompt) + + def check_for_new_messages(self, screenshot_path: str) -> bool: + """快速检查是否有新消息 + + Args: + screenshot_path: 截图文件路径 + + Returns: + True 如果有未读消息红色标记,False 否则 + """ + prompt = """快速判断:这张微信截图中是否有未读消息的红点或数字标记? +只返回 true 或 false。""" + result = self.analyze_image(screenshot_path, prompt).strip().lower() + return "true" in result and "false" not in result + + +class QwenVL2Client(BailianVLMClient): + """Qwen-VL2 客户端(向后兼容) + + 兼容旧的接口,内部使用 BailianVLMClient + """ + + def __init__( + self, + api_base: str = DASHSCOPE_BASE_URL, + api_key: str = "", + model_name: str = "qwen-vl-latest", + max_tokens: int = 2048, + temperature: float = 0.7 + ): + super().__init__( + api_key=api_key, + model_name=model_name, + base_url=api_base, + max_tokens=max_tokens, + temperature=temperature + ) class VLMError(Exception): @@ -236,20 +418,51 @@ class VLMError(Exception): # 工厂函数 -def create_vlm_client(config: dict) -> BaseVLM: - """创建 VLM 客户端""" - model_type = config.get("model_type", "qwen-vl2").lower() +def create_vlm_client(config: dict) -> BailianVLMClient: + """创建 VLM 客户端 - if model_type == "qwen-vl2": - return QwenVL2Client( - api_base=config.get("api_base", "http://localhost:8000/v1"), + Args: + config: 配置字典,包含: + - model_type: 模型类型(bailian / qwen-vl) + - api_key: API Key(可选,从环境变量读取) + - model_name: 模型名称 + - api_base: API 基础地址 + + Returns: + BailianVLMClient 实例 + """ + model_type = config.get("model_type", "bailian").lower() + + if model_type in ("bailian", "qwen-vl", "qwen", "aliyun"): + return BailianVLMClient( api_key=config.get("api_key", ""), - model_name=config.get("model_name", "Qwen-VL2"), + model_name=config.get("model_name", "qwen-vl-latest"), + base_url=config.get("api_base", DASHSCOPE_BASE_URL), max_tokens=config.get("max_tokens", 2048), temperature=config.get("temperature", 0.7) ) elif model_type == "gpt-4v": - # GPT-4V 客户端(待实现) - raise NotImplementedError("GPT-4V 客户端待实现") + raise NotImplementedError("GPT-4V 客户端请使用 OpenAI 兼容接口") else: raise ValueError(f"不支持的 VLM 类型: {model_type}") + + +# 直接使用函数 +def analyze_wechat_screenshot(screenshot_path: str, api_key: str = None) -> Dict[str, Any]: + """快捷函数:分析微信截图 + + 这是一个便捷函数,可以直接分析微信截图。 + + Args: + screenshot_path: 截图文件路径 + api_key: 阿里云百炼 API Key(可选,从环境变量读取) + + Returns: + 解析后的聊天信息 + + 示例: + result = analyze_wechat_screenshot("wechat.png") + print(result["messages"]) + """ + client = BailianVLMClient(api_key=api_key) + return client.analyze_chat_screenshot(screenshot_path)