---
name: phase2-core
description: Phase 2 core feature implementation. Implements the markdown parser, vectorization, indexing, differential sync, and vector search. Use after Phase 1 is complete, when implementing the search functionality.
---
# Phase 2: Core Feature Implementation

Implements the core vector search functionality of devrag.
## Prerequisites

Phase 1 must be complete:

- The Go project is initialized
- The configuration module is implemented (the fields Phase 2 relies on are sketched below)
- SQLite + vec0 is set up
- ONNX Runtime is integrated
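The Phase 2 code reads `idx.config.ChunkSize` and `idx.config.DocumentsDir`. As a reference point, the Phase 1 configuration module is assumed to expose at least the following fields; this is a sketch, and the actual Phase 1 field names may differ:

```go
package config

// Config fields assumed by Phase 2 (align with the real Phase 1 module).
type Config struct {
	DocumentsDir string // directory scanned for .md files
	ChunkSize    int    // target chunk size in bytes
}
```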
## Task List

### 2.1 Markdown Parser Implementation

File: `internal/indexer/markdown.go`
```go
package indexer

import (
	"bufio"
	"fmt"
	"os"
	"strings"
	"unicode/utf8"
)

type Chunk struct {
	Content  string
	Position int
}

// ParseMarkdown parses a markdown file and splits it into chunks.
func ParseMarkdown(filepath string, chunkSize int) ([]Chunk, error) {
	file, err := os.Open(filepath)
	if err != nil {
		return nil, fmt.Errorf("failed to open file: %w", err)
	}
	defer file.Close()

	// Read the entire file, normalizing line endings.
	var content strings.Builder
	scanner := bufio.NewScanner(file)
	// Raise the default 64KB token limit so very long lines don't fail.
	scanner.Buffer(make([]byte, 0, 1024*1024), 1024*1024)
	for scanner.Scan() {
		content.WriteString(scanner.Text())
		content.WriteString("\n")
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("failed to read file: %w", err)
	}

	// Split into chunks
	chunks := splitIntoChunks(content.String(), chunkSize)

	// Create Chunk structs
	result := make([]Chunk, len(chunks))
	for i, c := range chunks {
		result[i] = Chunk{
			Content:  c,
			Position: i,
		}
	}
	return result, nil
}

// splitIntoChunks splits text into chunks of approximately chunkSize bytes.
func splitIntoChunks(content string, chunkSize int) []string {
	if len(content) <= chunkSize {
		return []string{content}
	}

	var chunks []string
	var currentChunk strings.Builder

	// Split by paragraphs (double newline)
	paragraphs := strings.Split(content, "\n\n")
	for _, para := range paragraphs {
		para = strings.TrimSpace(para)
		if para == "" {
			continue
		}

		// If adding this paragraph would exceed the chunk size, start a new chunk.
		if currentChunk.Len() > 0 && currentChunk.Len()+len(para) > chunkSize {
			chunks = append(chunks, currentChunk.String())
			currentChunk.Reset()
		}

		if len(para) > chunkSize {
			// A single paragraph is too large: flush any pending chunk,
			// then emit each sub-chunk directly.
			if currentChunk.Len() > 0 {
				chunks = append(chunks, currentChunk.String())
				currentChunk.Reset()
			}
			chunks = append(chunks, splitLargeParagraph(para, chunkSize)...)
		} else {
			if currentChunk.Len() > 0 {
				currentChunk.WriteString("\n\n")
			}
			currentChunk.WriteString(para)
		}
	}

	// Add the remaining chunk
	if currentChunk.Len() > 0 {
		chunks = append(chunks, currentChunk.String())
	}
	return chunks
}

// splitLargeParagraph splits a large paragraph into smaller chunks,
// preferring sentence boundaries and never cutting inside a UTF-8 rune.
func splitLargeParagraph(para string, chunkSize int) []string {
	var chunks []string
	for len(para) > chunkSize {
		// Try to split at a sentence boundary in the second half of the window.
		cutPoint := chunkSize
		found := false
		for i := chunkSize; i > chunkSize/2; i-- {
			if para[i] == '.' || para[i] == '!' || para[i] == '?' || para[i] == '\n' {
				cutPoint = i + 1
				found = true
				break
			}
		}
		// No sentence boundary found: back off to a rune boundary so
		// multi-byte characters (e.g. Japanese) are not split mid-rune.
		if !found {
			for cutPoint > 0 && !utf8.RuneStart(para[cutPoint]) {
				cutPoint--
			}
		}
		chunks = append(chunks, strings.TrimSpace(para[:cutPoint]))
		para = strings.TrimSpace(para[cutPoint:])
	}
	if len(para) > 0 {
		chunks = append(chunks, para)
	}
	return chunks
}
```
Test cases (a sketch follows the list):

- Short file (<500 characters)
- Long file (>5000 characters)
- Contains code blocks
- Mixed Japanese and English
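A minimal table-driven test sketch covering these cases; the file contents here are illustrative, not from the source:

```go
package indexer

import (
	"os"
	"path/filepath"
	"strings"
	"testing"
	"unicode/utf8"
)

func TestParseMarkdown(t *testing.T) {
	cases := []struct {
		name    string
		content string
	}{
		{"short", "# Title\n\nA short note."},
		{"long", strings.Repeat("A fairly long paragraph of text.\n\n", 300)},
		{"code block", "Intro.\n\n    fmt.Println(\"hi\")\n\nOutro."},
		{"mixed ja/en", "English text. 日本語のテキスト。More English."},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			path := filepath.Join(t.TempDir(), "doc.md")
			if err := os.WriteFile(path, []byte(tc.content), 0o644); err != nil {
				t.Fatal(err)
			}
			chunks, err := ParseMarkdown(path, 500)
			if err != nil {
				t.Fatalf("ParseMarkdown: %v", err)
			}
			if len(chunks) == 0 {
				t.Fatal("expected at least one chunk")
			}
			for _, c := range chunks {
				// Guards against mid-rune cuts in mixed-language input.
				if !utf8.ValidString(c.Content) {
					t.Errorf("chunk %d is not valid UTF-8", c.Position)
				}
			}
		})
	}
}
```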
### 2.2 Vectorization Implementation

File: `internal/embedder/tokenizer.go`
```go
package embedder

import (
	"strings"
	"unicode"
)

// SimpleTokenizer provides basic tokenization.
// Note: for production, use a proper BPE tokenizer.
type SimpleTokenizer struct {
	vocabSize int
}

// Tokenize converts text to token IDs (simplified).
func (t *SimpleTokenizer) Tokenize(text string) []int32 {
	// Simplified tokenization.
	// In production, use a proper BPE tokenizer from sentencepiece or similar.
	text = strings.ToLower(text)
	words := strings.FieldsFunc(text, func(r rune) bool {
		return unicode.IsSpace(r) || unicode.IsPunct(r)
	})

	// Convert words to IDs via hashing (simplified; collisions are possible).
	tokens := make([]int32, len(words))
	for i, word := range words {
		tokens[i] = int32(hashString(word) % t.vocabSize)
	}
	return tokens
}

// hashString returns a non-negative polynomial hash of s.
func hashString(s string) int {
	h := 0
	for _, c := range s {
		h = h*31 + int(c)
	}
	if h < 0 {
		h = -h
	}
	return h
}
```
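A quick sanity check as a Go example test; note that IDs are hash-derived, so distinct words may collide:

```go
package embedder

import "fmt"

// ExampleSimpleTokenizer_Tokenize shows the word-level splitting:
// punctuation and whitespace act as separators.
func ExampleSimpleTokenizer_Tokenize() {
	tok := &SimpleTokenizer{vocabSize: 30000}
	ids := tok.Tokenize("Hello, world! こんにちは")
	fmt.Println(len(ids)) // tokens: "hello", "world", "こんにちは"
	// Output: 3
}
```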
File: `internal/embedder/onnx.go` (update)

```go
// Embed embeds a single text (placeholder implementation).
func (e *ONNXEmbedder) Embed(text string) ([]float32, error) {
	// Tokenize (proper tokenization is still TODO).
	tokenizer := &SimpleTokenizer{vocabSize: 30000}
	tokens := tokenizer.Tokenize(text)
	_ = tokens // will be fed to the ONNX session once inference is implemented

	// TODO: Implement ONNX inference.
	// Return a placeholder embedding for now.
	embedding := make([]float32, 384)
	for i := range embedding {
		embedding[i] = 0.1
	}
	return embedding, nil
}

// EmbedBatch embeds multiple texts.
func (e *ONNXEmbedder) EmbedBatch(texts []string) ([][]float32, error) {
	results := make([][]float32, len(texts))
	for i, text := range texts {
		emb, err := e.Embed(text)
		if err != nil {
			return nil, err
		}
		results[i] = emb
	}
	return results, nil
}
```
### 2.3 Indexing Implementation

File: `internal/indexer/indexer.go`
```go
package indexer

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/towada/devrag/internal/config"
	"github.com/towada/devrag/internal/embedder"
	"github.com/towada/devrag/internal/vectordb"
)

type Indexer struct {
	db       *vectordb.DB
	embedder embedder.Embedder
	config   *config.Config
}

// NewIndexer creates a new indexer.
func NewIndexer(db *vectordb.DB, emb embedder.Embedder, cfg *config.Config) *Indexer {
	return &Indexer{
		db:       db,
		embedder: emb,
		config:   cfg,
	}
}

// IndexFile indexes a single markdown file.
// The parameter is named path to avoid shadowing the path/filepath package.
func (idx *Indexer) IndexFile(path string) error {
	fmt.Fprintf(os.Stderr, "[INFO] Indexing file: %s\n", path)

	// Get file info
	info, err := os.Stat(path)
	if err != nil {
		return fmt.Errorf("failed to stat file: %w", err)
	}

	// Parse markdown
	chunks, err := ParseMarkdown(path, idx.config.ChunkSize)
	if err != nil {
		return fmt.Errorf("failed to parse markdown: %w", err)
	}
	fmt.Fprintf(os.Stderr, "[INFO] Parsed %d chunks\n", len(chunks))

	// Vectorize chunks
	texts := make([]string, len(chunks))
	for i, chunk := range chunks {
		texts[i] = chunk.Content
	}
	vectors, err := idx.embedder.EmbedBatch(texts)
	if err != nil {
		return fmt.Errorf("failed to vectorize: %w", err)
	}

	// Store in the database. Chunk contents are passed as plain strings
	// (position = slice index) so vectordb need not import this package,
	// which would create an import cycle.
	if err := idx.db.InsertDocument(path, info.ModTime(), texts, vectors); err != nil {
		return fmt.Errorf("failed to store in database: %w", err)
	}

	fmt.Fprintf(os.Stderr, "[INFO] Successfully indexed %s (%d chunks)\n", path, len(chunks))
	return nil
}

// IndexDirectory indexes all markdown files in a directory.
func (idx *Indexer) IndexDirectory(dir string) error {
	return filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if !info.IsDir() && filepath.Ext(path) == ".md" {
			if err := idx.IndexFile(path); err != nil {
				fmt.Fprintf(os.Stderr, "[WARN] Failed to index %s: %v\n", path, err)
			}
		}
		return nil
	})
}
```
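Hypothetical wiring from the application entry point; `db`, `emb`, and `cfg` come from the Phase 1 setup code, whose exact constructors are assumed here:

```go
// Somewhere in main(), after Phase 1 initialization (sketch).
idx := indexer.NewIndexer(db, emb, cfg)
if err := idx.IndexDirectory(cfg.DocumentsDir); err != nil {
	fmt.Fprintf(os.Stderr, "[ERROR] Indexing failed: %v\n", err)
	os.Exit(1)
}
```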
File: `internal/vectordb/sqlite.go` (update)

```go
// InsertDocument inserts a document with its chunk contents and vectors.
// Contents are plain strings with position = slice index; this keeps
// vectordb from importing internal/indexer (which would be an import
// cycle, since indexer already imports vectordb).
func (db *DB) InsertDocument(filename string, modTime time.Time, contents []string, vectors [][]float32) error {
	tx, err := db.conn.Begin()
	if err != nil {
		return fmt.Errorf("failed to begin transaction: %w", err)
	}
	defer tx.Rollback()

	// Insert document
	result, err := tx.Exec(
		"INSERT OR REPLACE INTO documents (filename, modified_at) VALUES (?, ?)",
		filename, modTime,
	)
	if err != nil {
		return fmt.Errorf("failed to insert document: %w", err)
	}
	docID, err := result.LastInsertId()
	if err != nil {
		return fmt.Errorf("failed to get document ID: %w", err)
	}

	// Insert chunks
	for i, content := range contents {
		result, err := tx.Exec(
			"INSERT INTO chunks (document_id, position, content) VALUES (?, ?, ?)",
			docID, i, content,
		)
		if err != nil {
			return fmt.Errorf("failed to insert chunk: %w", err)
		}
		chunkID, err := result.LastInsertId()
		if err != nil {
			return fmt.Errorf("failed to get chunk ID: %w", err)
		}

		// TODO: Insert the vector into the vec_chunks virtual table
		// (see the sketch below).
		_ = chunkID
		_ = vectors[i]
	}

	return tx.Commit()
}
```
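A minimal sketch of the missing vec_chunks insert, assuming the Phase 1 schema defines a vec0 virtual table along the lines of `vec_chunks(chunk_id integer primary key, embedding float[384])`. sqlite-vec accepts a JSON array as the text form of a float vector, so the slice can be bound via `encoding/json` (add it to the file's imports):

```go
// Inside the chunk loop, in place of the TODO above (sketch; schema assumed).
vecJSON, err := json.Marshal(vectors[i])
if err != nil {
	return fmt.Errorf("failed to encode vector: %w", err)
}
if _, err := tx.Exec(
	"INSERT INTO vec_chunks (chunk_id, embedding) VALUES (?, ?)",
	chunkID, string(vecJSON),
); err != nil {
	return fmt.Errorf("failed to insert vector: %w", err)
}
```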
### 2.4 Differential Sync Implementation

File: `internal/indexer/sync.go`
```go
package indexer

import (
	"fmt"
	"os"
	"path/filepath"
	"time"
)

type SyncResult struct {
	Added   []string
	Updated []string
	Deleted []string
}

// Sync synchronizes the documents directory with the database.
func (idx *Indexer) Sync() (*SyncResult, error) {
	fmt.Fprintf(os.Stderr, "[INFO] Starting sync...\n")

	result := &SyncResult{
		Added:   []string{},
		Updated: []string{},
		Deleted: []string{},
	}

	// Get indexed files from the database (filename -> modification time).
	dbFiles, err := idx.db.ListDocuments()
	if err != nil {
		return nil, fmt.Errorf("failed to list database files: %w", err)
	}

	// Scan the filesystem.
	fsFiles := make(map[string]time.Time)
	err = filepath.Walk(idx.config.DocumentsDir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if !info.IsDir() && filepath.Ext(path) == ".md" {
			fsFiles[path] = info.ModTime()
		}
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("failed to scan filesystem: %w", err)
	}

	// Detect additions and updates.
	for fsPath, fsMtime := range fsFiles {
		if dbMtime, exists := dbFiles[fsPath]; !exists {
			// New file
			result.Added = append(result.Added, fsPath)
			if err := idx.IndexFile(fsPath); err != nil {
				fmt.Fprintf(os.Stderr, "[ERROR] Failed to index new file %s: %v\n", fsPath, err)
			}
		} else if !fsMtime.Equal(dbMtime) {
			// Updated file: delete the old version, then reindex.
			result.Updated = append(result.Updated, fsPath)
			if err := idx.db.DeleteDocument(fsPath); err != nil {
				fmt.Fprintf(os.Stderr, "[ERROR] Failed to delete old version of %s: %v\n", fsPath, err)
				continue
			}
			if err := idx.IndexFile(fsPath); err != nil {
				fmt.Fprintf(os.Stderr, "[ERROR] Failed to reindex %s: %v\n", fsPath, err)
			}
		}
	}

	// Detect deletions.
	for dbPath := range dbFiles {
		if _, exists := fsFiles[dbPath]; !exists {
			result.Deleted = append(result.Deleted, dbPath)
			if err := idx.db.DeleteDocument(dbPath); err != nil {
				fmt.Fprintf(os.Stderr, "[ERROR] Failed to delete %s from database: %v\n", dbPath, err)
			}
		}
	}

	fmt.Fprintf(os.Stderr, "[INFO] Sync complete: +%d, ~%d, -%d\n",
		len(result.Added), len(result.Updated), len(result.Deleted))
	return result, nil
}
```
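A hypothetical call site, e.g. on server startup or on a timer:

```go
res, err := idx.Sync()
if err != nil {
	return fmt.Errorf("sync failed: %w", err)
}
fmt.Fprintf(os.Stderr, "[INFO] Sync result: added=%d updated=%d deleted=%d\n",
	len(res.Added), len(res.Updated), len(res.Deleted))
```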
### 2.5 Vector Search Implementation

File: `internal/vectordb/search.go`
```go
package vectordb

import (
	"fmt"
	"time"
)

type SearchResult struct {
	DocumentName string
	ChunkContent string
	Similarity   float64
	Position     int
}

// Search performs vector similarity search.
func (db *DB) Search(queryVector []float32, topK int) ([]SearchResult, error) {
	// TODO: Implement vec0 similarity search (a sketch follows this file).
	// Intended SQL (cosine distance ascending: smaller = more similar):
	//   SELECT
	//     d.filename,
	//     c.content,
	//     c.position,
	//     vec_distance_cosine(v.embedding, ?) AS distance
	//   FROM vec_chunks v
	//   JOIN chunks c ON v.chunk_id = c.id
	//   JOIN documents d ON c.document_id = d.id
	//   ORDER BY distance ASC
	//   LIMIT ?

	// Placeholder implementation
	results := []SearchResult{}
	return results, nil
}

// ListDocuments returns all indexed documents (filename -> modification time).
func (db *DB) ListDocuments() (map[string]time.Time, error) {
	rows, err := db.conn.Query("SELECT filename, modified_at FROM documents")
	if err != nil {
		return nil, fmt.Errorf("failed to query documents: %w", err)
	}
	defer rows.Close()

	docs := make(map[string]time.Time)
	for rows.Next() {
		var filename string
		var modTime time.Time
		if err := rows.Scan(&filename, &modTime); err != nil {
			return nil, fmt.Errorf("failed to scan row: %w", err)
		}
		docs[filename] = modTime
	}
	return docs, rows.Err()
}

// DeleteDocument deletes a document and its chunks.
// Assumes chunks reference documents with ON DELETE CASCADE; otherwise
// delete from chunks (and vec_chunks) explicitly first.
func (db *DB) DeleteDocument(filename string) error {
	_, err := db.conn.Exec("DELETE FROM documents WHERE filename = ?", filename)
	return err
}
```
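Once vec_chunks is populated, Search can be filled in along the lines of the sketch below, which follows the SQL from the TODO above. It binds the query vector as a JSON array (accepted by sqlite-vec for float columns) and converts cosine distance into a similarity score; treat it as a starting point under those assumptions, not a final implementation:

```go
// Sketch of the eventual Search body (requires "encoding/json" in imports).
func (db *DB) searchVec0(queryVector []float32, topK int) ([]SearchResult, error) {
	qJSON, err := json.Marshal(queryVector)
	if err != nil {
		return nil, fmt.Errorf("failed to encode query vector: %w", err)
	}
	rows, err := db.conn.Query(`
		SELECT d.filename, c.content, c.position,
		       vec_distance_cosine(v.embedding, ?) AS distance
		FROM vec_chunks v
		JOIN chunks c ON v.chunk_id = c.id
		JOIN documents d ON c.document_id = d.id
		ORDER BY distance ASC
		LIMIT ?`, string(qJSON), topK)
	if err != nil {
		return nil, fmt.Errorf("search query failed: %w", err)
	}
	defer rows.Close()

	var results []SearchResult
	for rows.Next() {
		var r SearchResult
		var distance float64
		if err := rows.Scan(&r.DocumentName, &r.ChunkContent, &r.Position, &distance); err != nil {
			return nil, fmt.Errorf("failed to scan row: %w", err)
		}
		// Cosine distance lies in [0, 2]; 1 - distance gives a familiar
		// similarity score (1 = identical direction).
		r.Similarity = 1 - distance
		results = append(results, r)
	}
	return results, rows.Err()
}
```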
## Phase 2 Completion Criteria

- Markdown files can be parsed
- Chunk splitting works correctly
- Text can be vectorized
- Indexing completes
- Differential sync works
- Search queries execute (after vec0 integration)
## Performance Targets

- Indexing speed: > 100 chunks/sec (see the benchmark sketch below)
- Search latency: < 500 ms (1,000 entries per DB)
- Memory usage: < 500 MB
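As a rough check on the first target, a benchmark of the chunk-splitting stage alone; this is a sketch, and real indexing throughput is dominated by the embedder and database writes:

```go
package indexer

import (
	"strings"
	"testing"
)

// BenchmarkSplitIntoChunks measures only the splitting stage; divide the
// number of chunks produced by elapsed time to approximate chunks/sec.
func BenchmarkSplitIntoChunks(b *testing.B) {
	doc := strings.Repeat("Some paragraph text used for benchmarking.\n\n", 200)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		splitIntoChunks(doc, 500)
	}
}
```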
## Next Steps

After Phase 2 is complete, use the phase3-mcp skill to integrate the MCP protocol.