Commit c999976d authored by Andrey Filippov

RAG-related scripts

parent 7e9c0c3e
......@@ -80,3 +80,108 @@ Outputs:
- CSV file with per-sensor azimuth/tilt/roll columns and stats
Tags: csv, models, aggregation, plot
## rag_index.py
Build a local RAG index from attic/CODEX/rag_sources using fastembed + hnswlib.
Path: `scripts/rag_index.py`
Example:
```bash
/home/elphel/git/imagej-elphel/scripts/rag_index.py --source /home/elphel/git/imagej-elphel/attic/CODEX/rag_sources --out /home/elphel/git/imagej-elphel/attic/CODEX/rag_index
```
Inputs:
- rag_sources root directory
- embedding model name (default: BAAI/bge-base-en-v1.5)
Outputs:
- index.bin
- meta.jsonl
- config.json
Dependencies:
- fastembed
- hnswlib
- numpy
- tqdm
- pdftotext command-line tool (only needed for PDF sources)
Tags: rag, index, embedding
## rag_index.sh
Wrapper to run rag_index.py using the RAG venv.
Path: `scripts/rag_index.sh`
Example:
```bash
/home/elphel/git/imagej-elphel/scripts/rag_index.sh --source /home/elphel/git/imagej-elphel/attic/CODEX/rag_sources --out /home/elphel/git/imagej-elphel/attic/CODEX/rag_index
```
Inputs:
- same args as rag_index.py
Outputs:
- index.bin
- meta.jsonl
- config.json
Dependencies:
- rag_index.py
- rag venv
Tags: rag, index, wrapper
## rag_query.py
Query a local RAG index built by rag_index.py.
Path: `scripts/rag_query.py`
Example:
```bash
/home/elphel/git/imagej-elphel/scripts/rag_query.py "Explain differential rectification"
```
Inputs:
- query string
- index directory (default: attic/CODEX/rag_index)
Outputs:
- Top-k matching chunks with file paths and snippets
Dependencies:
- fastembed
- hnswlib
- numpy
Tags: rag, query
## rag_query.sh
Wrapper to run rag_query.py using the RAG venv.
Path: `scripts/rag_query.sh`
Example:
```bash
/home/elphel/git/imagej-elphel/scripts/rag_query.sh "Explain differential rectification"
```
Inputs:
- query string
- index directory (default: attic/CODEX/rag_index)
Outputs:
- Top-k matching chunks with file paths and snippets
Dependencies:
- rag_query.py
- rag venv
Tags: rag, query, wrapper
......@@ -51,6 +51,91 @@
"dependencies": [],
"owner": "codex",
"created": "2026-02-03"
},
{
"name": "rag_index.py",
"path": "scripts/rag_index.py",
"purpose": "Build a local RAG index from attic/CODEX/rag_sources using fastembed + hnswlib.",
"inputs": [
"rag_sources root directory",
"embedding model name (default: BAAI/bge-base-en-v1.5)"
],
"outputs": [
"index.bin",
"meta.jsonl",
"config.json"
],
"example": "/home/elphel/git/imagej-elphel/scripts/rag_index.py --source /home/elphel/git/imagej-elphel/attic/CODEX/rag_sources --out /home/elphel/git/imagej-elphel/attic/CODEX/rag_index",
"tags": ["rag", "index", "embedding"],
"dependencies": [
"fastembed",
"hnswlib",
"numpy",
"tqdm",
"pdftotext (only needed for PDF sources)"
],
"owner": "codex",
"created": "2026-02-04"
},
{
"name": "rag_index.sh",
"path": "scripts/rag_index.sh",
"purpose": "Wrapper to run rag_index.py using the RAG venv.",
"inputs": [
"same args as rag_index.py"
],
"outputs": [
"index.bin",
"meta.jsonl",
"config.json"
],
"example": "/home/elphel/git/imagej-elphel/scripts/rag_index.sh --source /home/elphel/git/imagej-elphel/attic/CODEX/rag_sources --out /home/elphel/git/imagej-elphel/attic/CODEX/rag_index",
"tags": ["rag", "index", "wrapper"],
"dependencies": [
"rag_index.py",
"rag venv"
],
"owner": "codex",
"created": "2026-02-04"
},
{
"name": "rag_query.py",
"path": "scripts/rag_query.py",
"purpose": "Query a local RAG index built by rag_index.py.",
"inputs": [
"query string",
"index directory (default: attic/CODEX/rag_index)"
],
"outputs": [
"Top-k matching chunks with file paths and snippets"
],
"example": "/home/elphel/git/imagej-elphel/scripts/rag_query.py \"Explain differential rectification\"",
"tags": ["rag", "query"],
"dependencies": [
"fastembed",
"hnswlib",
"numpy"
],
"owner": "codex",
"created": "2026-02-04"
},
{
"name": "rag_query.sh",
"path": "scripts/rag_query.sh",
"purpose": "Wrapper to run rag_query.py using the RAG venv.",
"inputs": [
"query string",
"index directory (default: attic/CODEX/rag_index)"
],
"outputs": [
"Top-k matching chunks with file paths and snippets"
],
"example": "/home/elphel/git/imagej-elphel/scripts/rag_query.sh \"Explain differential rectification\"",
"tags": ["rag", "query", "wrapper"],
"dependencies": [
"rag_query.py",
"rag venv"
],
"owner": "codex",
"created": "2026-02-04"
}
]
}
......@@ -136,6 +136,17 @@ curl "http://127.0.0.1:48888/mcp/fs/csvcol?path=/path/to/file.csv&col=3"
curl "http://127.0.0.1:48888/mcp/fs/csvcol?path=/path/to/file.csv&col=3&sep=,"
```
### 7) RAG query (local index)
Query the local RAG index built by `scripts/rag_index.py`.
Query parameters:
- `text` (required): query string
- `topK` (optional): number of results (default 5)
```
curl "http://127.0.0.1:48888/mcp/rag/query?text=Explain%20differential%20rectification&topK=5"
```
Example (config directory):
```
-Delphel.mcp.allowed.configdir=/media/elphel/btrfs-data/lwir16-proc/NC/config
......
#!/usr/bin/env python3
"""Build a local RAG index from attic/CODEX/rag_sources.
Outputs:
- index.bin (hnswlib)
- meta.jsonl (chunk metadata)
- config.json
"""
from __future__ import annotations
import argparse
import hashlib
import json
import os
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, List, Tuple
import hnswlib
import numpy as np
from fastembed import TextEmbedding
from tqdm import tqdm
def sha1_text(text: str) -> str:
    """Return the hexadecimal SHA-1 digest of *text*.

    The text is hashed as UTF-8 bytes; characters that cannot be encoded
    (for example lone surrogates) are silently left out of the hash input.
    """
    encoded = text.encode("utf-8", errors="ignore")
    return hashlib.sha1(encoded).hexdigest()
def read_text_file(path: Path) -> str:
    """Return the whole content of *path* decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped instead of raising.
    """
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
def latex_to_text(text: str) -> str:
    """Reduce LaTeX markup to approximate plain text.

    Whole-line ``%`` comments and command names are removed while the
    arguments written in braces are kept; the result is a single line with
    whitespace runs collapsed to one blank and no leading/trailing blanks.
    """
    # Applied strictly in this order: each rule relies on the previous ones.
    substitutions = (
        # Lines consisting only of a comment.
        (r"(?m)^\s*%.*$", ""),
        # "\command{" -> "{": drop the command name, keep its argument.
        (r"\\[a-zA-Z*]+\s*\{", "{"),
        # "\command[options]": drop the command together with its options.
        (r"\\[a-zA-Z*]+\s*\[.*?\]", ""),
        # Every command still present becomes a blank.
        (r"\\[a-zA-Z*]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Braces carry no meaning once the commands are gone.
    text = text.translate(str.maketrans("{}", "  "))
    # Squeeze each whitespace run (including newlines) into one blank.
    return re.sub(r"\s+", " ", text).strip()
def pdf_to_text(path: Path) -> str:
    """Extract the text of a PDF by running the external ``pdftotext`` tool.

    Args:
        path: PDF file to convert.

    Returns:
        The text printed by ``pdftotext``, decoded as UTF-8 with undecodable
        bytes dropped.

    Raises:
        RuntimeError: if ``pdftotext`` cannot be started or exits with a
            non-zero status.  The original exception is attached as
            ``__cause__``.
    """
    try:
        # A "-" output argument makes pdftotext write to stdout.
        result = subprocess.run(
            ["pdftotext", "-q", str(path), "-"],
            check=True,
            capture_output=True,
        )
    except FileNotFoundError as exc:
        # Raised by subprocess when the executable itself is missing.
        raise RuntimeError(
            f"pdftotext executable not found (needed for {path})"
        ) from exc
    except subprocess.CalledProcessError as exc:
        stderr = (exc.stderr or b"").decode("utf-8", errors="ignore").strip()
        detail = f" ({stderr})" if stderr else ""
        raise RuntimeError(f"pdftotext failed for {path}: {exc}{detail}") from exc
    return result.stdout.decode("utf-8", errors="ignore")
def iter_source_files(root: Path, exclude_substrings: List[str]) -> Iterable[Path]:
    """Yield the indexable files found recursively below *root*.

    A file qualifies when its suffix (compared case-insensitively) is one of
    ``.md``, ``.txt``, ``.tex`` or ``.pdf`` and its path string contains none
    of *exclude_substrings*.
    """
    wanted_suffixes = (".md", ".txt", ".tex", ".pdf")
    for candidate in root.rglob("*"):
        if not candidate.is_file():
            continue
        location = str(candidate)
        is_excluded = False
        for fragment in exclude_substrings:
            if fragment in location:
                is_excluded = True
                break
        if not is_excluded and candidate.suffix.lower() in wanted_suffixes:
            yield candidate
def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Split *text* into consecutive, overlapping character windows.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per window; must be >= 1.
        overlap: Number of characters shared by neighbouring windows; must be
            smaller than *chunk_size*.  A negative value leaves a gap between
            windows instead of an overlap.

    Returns:
        The windows in order, each stripped of surrounding whitespace;
        windows that are empty after stripping are omitted.

    Raises:
        ValueError: if *chunk_size* is not positive or *overlap* is not
            smaller than *chunk_size*.  With such values the window start
            could never advance and the loop would not terminate.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    if not text:
        return []
    chunks: List[str] = []
    n = len(text)
    start = 0
    while start < n:
        end = min(n, start + chunk_size)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= n:
            break
        # end == start + chunk_size here, so the new start is strictly
        # greater than the old one because overlap < chunk_size.
        start = end - overlap
    return chunks
def file_to_chunks(path: Path, chunk_size: int, overlap: int) -> List[str]:
    """Load *path* as plain text and split it into chunks.

    PDF files are converted with ``pdf_to_text``; every other file is read as
    text, and ``.tex`` files are additionally passed through
    ``latex_to_text``.  The chunking itself is done by ``chunk_text``.
    """
    kind = path.suffix.lower()
    if kind == ".pdf":
        return chunk_text(pdf_to_text(path), chunk_size, overlap)
    raw = read_text_file(path)
    content = latex_to_text(raw) if kind == ".tex" else raw
    return chunk_text(content, chunk_size, overlap)
def main() -> int:
    """Command-line entry point: build the RAG index and write it to disk.

    Collects the source files below ``--source``, splits each one into
    chunks, embeds the chunks with fastembed and stores the length-normalized
    vectors in an hnswlib cosine index.  Three files are written to
    ``--out``: ``index.bin``, ``meta.jsonl`` (one JSON object per chunk) and
    ``config.json``.

    Returns:
        0 on success, 1 when no source files were found or no embeddings
        were produced.  Invalid ``--chunk-size`` / ``--overlap`` values end
        the program through ``parser.error`` (argparse exit status 2).
    """
    parser = argparse.ArgumentParser(description="Build local RAG index")
    parser.add_argument(
        "--source",
        default="/home/elphel/git/imagej-elphel/attic/CODEX/rag_sources",
        help="Root directory with sources",
    )
    parser.add_argument(
        "--out",
        default="/home/elphel/git/imagej-elphel/attic/CODEX/rag_index",
        help="Output directory",
    )
    parser.add_argument(
        "--model",
        default="BAAI/bge-base-en-v1.5",
        help="Embedding model for fastembed",
    )
    parser.add_argument("--chunk-size", type=int, default=3000)
    parser.add_argument("--overlap", type=int, default=300)
    # Not used while building the index; kept so existing invocations that
    # pass --top-k keep working.
    parser.add_argument("--top-k", type=int, default=10)
    # NOTE: with action="append" argparse adds user-supplied values to this
    # default list instead of replacing it.
    parser.add_argument(
        "--exclude",
        action="append",
        default=["/elphel-bib-glossary/"],
        help="Exclude paths containing this substring (can be repeated)",
    )
    args = parser.parse_args()

    # Reject settings with which the chunk window could never advance; the
    # check runs before anything touches the file system.
    if args.chunk_size < 1:
        parser.error("--chunk-size must be a positive integer")
    if not 0 <= args.overlap < args.chunk_size:
        parser.error("--overlap must be >= 0 and smaller than --chunk-size")

    source_root = Path(args.source).resolve()
    out_dir = Path(args.out).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    files = list(iter_source_files(source_root, args.exclude))
    if not files:
        print(f"No source files found under {source_root}", file=sys.stderr)
        return 1

    embedding = TextEmbedding(model_name=args.model)
    meta_path = out_dir / "meta.jsonl"
    index_path = out_dir / "index.bin"
    config_path = out_dir / "config.json"

    # Build chunks and embeddings
    all_meta = []
    all_vectors = []
    for path in tqdm(files, desc="Files"):
        try:
            chunks = file_to_chunks(path, args.chunk_size, args.overlap)
        except Exception as exc:
            # Best effort: one unreadable source must not abort the build.
            print(f"WARN: failed to read {path}: {exc}", file=sys.stderr)
            continue
        if not chunks:
            continue
        rel_path = path.relative_to(source_root)
        embeddings = list(embedding.embed(chunks))
        for idx, (chunk, vec) in enumerate(zip(chunks, embeddings)):
            vec = np.array(vec, dtype=np.float32)
            # normalize for cosine similarity
            norm = np.linalg.norm(vec)
            if norm > 0:
                vec = vec / norm
            all_vectors.append(vec)
            all_meta.append(
                {
                    # The id equals the position of the vector in the index.
                    "id": len(all_meta),
                    "source": str(rel_path),
                    "path": str(path),
                    "chunk_index": idx,
                    "text": chunk,
                    "sha1": sha1_text(chunk),
                }
            )

    if not all_vectors:
        print("No embeddings created; aborting", file=sys.stderr)
        return 1

    dim = len(all_vectors[0])
    index = hnswlib.Index(space="cosine", dim=dim)
    index.init_index(max_elements=len(all_vectors), ef_construction=200, M=16)
    # Labels are 0..N-1 so a label can be used directly as a meta.jsonl row.
    index.add_items(np.vstack(all_vectors), np.arange(len(all_vectors)))
    index.set_ef(100)
    index.save_index(str(index_path))

    with meta_path.open("w", encoding="utf-8") as f:
        for item in all_meta:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    config = {
        "source": str(source_root),
        "out": str(out_dir),
        "model": args.model,
        "chunk_size": args.chunk_size,
        "overlap": args.overlap,
        "count": len(all_meta),
        "created": datetime.now(timezone.utc).isoformat(),
    }
    config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")

    print(f"Index built: {index_path}")
    print(f"Metadata: {meta_path}")
    print(f"Config: {config_path}")
    print(f"Chunks: {len(all_meta)}")
    return 0
# Run main() only when the file is executed as a script; the value it
# returns becomes the process exit status.
if __name__ == "__main__":
raise SystemExit(main())
#!/usr/bin/env bash
# Run rag_index.py with the Python interpreter of the RAG virtualenv.
# All command-line arguments are forwarded unchanged to rag_index.py.
# Set RAG_VENV to use a virtualenv other than the default one below.
set -euo pipefail
VENV="${RAG_VENV:-/home/elphel/git/imagej-elphel/attic/CODEX/rag_index/.venv}"
exec "$VENV/bin/python" /home/elphel/git/imagej-elphel/scripts/rag_index.py "$@"
#!/usr/bin/env python3
"""Query a local RAG index built by rag_index.py."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import List
import hnswlib
import numpy as np
from fastembed import TextEmbedding
def load_meta(meta_path: Path) -> List[dict]:
    """Load the chunk metadata written next to the index.

    Args:
        meta_path: JSON Lines file with one JSON object per line.

    Returns:
        The decoded objects in file order.  Blank or whitespace-only lines
        (for example a trailing empty line) are skipped instead of failing.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with meta_path.open("r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
def main() -> int:
    """Command-line entry point: look up the chunks closest to a query.

    Reads ``config.json``, ``meta.jsonl`` and ``index.bin`` from
    ``--index-dir``, embeds the query with the model recorded in the config
    and prints the best matches, either as readable text or, with
    ``--json``, as one JSON document on stdout.

    Returns:
        0 on success.  A ``--top-k`` below 1 ends the program through
        ``parser.error`` (argparse exit status 2).
    """
    parser = argparse.ArgumentParser(description="Query local RAG index")
    parser.add_argument("query", help="Query text")
    parser.add_argument(
        "--index-dir",
        default="/home/elphel/git/imagej-elphel/attic/CODEX/rag_index",
        help="Index directory",
    )
    parser.add_argument("--top-k", type=int, default=5)
    parser.add_argument("--json", action="store_true", help="Output JSON only")
    args = parser.parse_args()
    if args.top_k < 1:
        parser.error("--top-k must be a positive integer")

    index_dir = Path(args.index_dir)
    config = json.loads((index_dir / "config.json").read_text(encoding="utf-8"))
    meta = load_meta(index_dir / "meta.jsonl")
    model = config.get("model", "BAAI/bge-base-en-v1.5")

    embedding = TextEmbedding(model_name=model)
    query_vec = list(embedding.embed([args.query]))[0]
    query_vec = np.array(query_vec, dtype=np.float32)
    # Scale to unit length, as done for the vectors stored in the index.
    norm = np.linalg.norm(query_vec)
    if norm > 0:
        query_vec = query_vec / norm

    # Load index
    dim = len(query_vec)
    index = hnswlib.Index(space="cosine", dim=dim)
    index.load_index(str(index_dir / "index.bin"))

    # knn_query raises when asked for more neighbours than the index holds,
    # so never request more than are stored.
    k = min(args.top_k, index.get_current_count())
    results = []
    if k > 0:
        # The search breadth must not be smaller than the number of
        # neighbours requested.
        index.set_ef(max(100, k))
        labels, distances = index.knn_query(query_vec, k=k)
        for rank, (idx, dist) in enumerate(zip(labels[0], distances[0]), start=1):
            # Labels are used as row numbers of meta.jsonl.
            item = meta[int(idx)]
            snippet = item["text"][:400].replace("\n", " ")
            results.append(
                {
                    "rank": rank,
                    "source": item["source"],
                    "chunk_index": item["chunk_index"],
                    # Cosine distance -> similarity.
                    "score": float(1.0 - dist),
                    "snippet": snippet,
                }
            )

    if args.json:
        print(json.dumps({"ok": True, "query": args.query, "results": results}, ensure_ascii=False))
        return 0

    print(f"Query: {args.query}")
    print("Results:\n")
    for item in results:
        print(f"{item['rank']}. {item['source']} (chunk {item['chunk_index']})")
        print(f" score: {item['score']:.4f}")
        print(f" {item['snippet']}")
        print()
    return 0
# Run main() only when the file is executed as a script; the value it
# returns becomes the process exit status.
if __name__ == "__main__":
raise SystemExit(main())
#!/usr/bin/env bash
# Run rag_query.py with the Python interpreter of the RAG virtualenv.
# All command-line arguments are forwarded unchanged to rag_query.py.
# Set RAG_VENV to use a virtualenv other than the default one below.
set -euo pipefail
VENV="${RAG_VENV:-/home/elphel/git/imagej-elphel/attic/CODEX/rag_index/.venv}"
exec "$VENV/bin/python" /home/elphel/git/imagej-elphel/scripts/rag_query.py "$@"
......@@ -103,6 +103,7 @@ public class McpServer {
server.createContext("/mcp/fs/tail", new FsTailHandler());
server.createContext("/mcp/fs/glob", new FsGlobHandler());
server.createContext("/mcp/fs/csvcol", new FsCsvColHandler());
server.createContext("/mcp/rag/query", new RagQueryHandler());
server.setExecutor(null);
server.start();
if (Eyesis_Correction.MCP_DEBUG_LEVEL >= Eyesis_Correction.MINIMAL_DEBUG_MCP) {
......@@ -537,6 +538,64 @@ public class McpServer {
}
}
// codex 2026-02-05: query local RAG index via rag_query.py
private class RagQueryHandler implements HttpHandler {
@Override
public void handle(HttpExchange exchange) throws IOException {
Map<String, String> params = parseParams(exchange);
String text = params.get("text");
if (text == null || text.isEmpty()) {
sendJson(exchange, 400, "{\"ok\":false,\"error\":\"Missing text\"}");
return;
}
int topK = parseInt(params.get("topK"), 5);
if (topK < 1) {
topK = 1;
}
String venvPython = "/home/elphel/git/imagej-elphel/attic/CODEX/rag_index/.venv/bin/python";
String script = "/home/elphel/git/imagej-elphel/scripts/rag_query.py";
List<String> cmd = new ArrayList<String>();
cmd.add(venvPython);
cmd.add(script);
cmd.add("--json");
cmd.add("--top-k");
cmd.add(String.valueOf(topK));
cmd.add(text);
ProcessBuilder pb = new ProcessBuilder(cmd);
pb.redirectErrorStream(true);
Process p;
try {
p = pb.start();
} catch (IOException e) {
sendJson(exchange, 500, "{\"ok\":false,\"error\":\"Failed to start rag_query\"}");
return;
}
byte[] out;
try {
out = p.getInputStream().readAllBytes();
} catch (IOException e) {
sendJson(exchange, 500, "{\"ok\":false,\"error\":\"Failed to read rag_query output\"}");
return;
}
try {
p.waitFor();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
String output = new String(out, StandardCharsets.UTF_8).trim();
if (p.exitValue() != 0) {
String err = jsonEscape(output);
sendJson(exchange, 500, "{\"ok\":false,\"error\":\"rag_query failed\",\"detail\":\"" + err + "\"}");
return;
}
if (output.isEmpty()) {
sendJson(exchange, 500, "{\"ok\":false,\"error\":\"rag_query returned empty\"}");
return;
}
sendJson(exchange, 200, output);
}
}
private String buildStatusJson() {
int stopRequested = owner.getSyncStopRequested();
StringBuilder sb = new StringBuilder();
......@@ -607,6 +666,10 @@ public class McpServer {
byte[] data = body.getBytes(StandardCharsets.UTF_8);
Headers headers = exchange.getResponseHeaders();
headers.set("Content-Type", "application/json; charset=utf-8");
// codex 2026-02-05: allow simple browser-based MCP clients
headers.set("Access-Control-Allow-Origin", "*");
headers.set("Access-Control-Allow-Methods", "GET,POST,OPTIONS");
headers.set("Access-Control-Allow-Headers", "Content-Type");
exchange.sendResponseHeaders(code, data.length);
OutputStream os = exchange.getResponseBody();
os.write(data);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment