hyf-backend/th_agenter/models/knowledge_base.py

93 lines
3.9 KiB
Python
Raw Normal View History

2026-01-21 13:45:39 +08:00
"""Knowledge base models."""
from typing import Optional
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy import String, Integer, Text, Boolean, JSON
from ..db.base import BaseModel
class KnowledgeBase(BaseModel):
    """ORM model mapped to the ``knowledge_bases`` table.

    Stores the descriptive fields of a knowledge base together with its
    chunking parameters, the embedding model name and the vector database
    settings. No relationships or foreign keys are declared on this model.
    """

    __tablename__ = "knowledge_bases"

    # Descriptive fields. The name is indexed but not required to be unique.
    name: Mapped[str] = mapped_column(
        String(100),
        nullable=False,
        index=True,
        unique=False,
    )
    description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)

    # Embedding and chunking configuration (all required, with defaults).
    embedding_model: Mapped[str] = mapped_column(
        String(100),
        default="sentence-transformers/all-MiniLM-L6-v2",
        nullable=False,
    )
    chunk_size: Mapped[int] = mapped_column(
        Integer,
        default=1000,
        nullable=False,
    )
    chunk_overlap: Mapped[int] = mapped_column(
        Integer,
        default=200,
        nullable=False,
    )
    is_active: Mapped[bool] = mapped_column(
        Boolean,
        nullable=False,
        default=True,
    )

    # Vector database settings.
    vector_db_type: Mapped[str] = mapped_column(
        String(50),
        default="chroma",
        nullable=False,
    )
    # Name of the collection inside the vector database (optional).
    collection_name: Mapped[Optional[str]] = mapped_column(
        String(100),
        nullable=True,
    )

    # Relationships were removed to eliminate foreign key constraints.

    def __repr__(self) -> str:
        """Return a short debug representation showing ``id`` and ``name``."""
        # NOTE(review): ``id`` is not declared in this class; presumably it
        # is provided by ``BaseModel`` -- confirm.
        return f"<KnowledgeBase(id={self.id}, name='{self.name}')>"

    # NOTE: The properties below depend on a ``documents`` relationship that
    # this model no longer declares (it was removed together with the foreign
    # key constraints). They stay disabled until they are rewritten.
    #
    # @property
    # def document_count(self):
    #     """Get the number of documents in this knowledge base."""
    #     return len(self.documents)
    #
    # @property
    # def active_document_count(self):
    #     """Get the number of active documents in this knowledge base."""
    #     return len([doc for doc in self.documents if doc.is_processed])
class Document(BaseModel):
    """ORM model mapped to the ``documents`` table.

    Holds file information for a document, its processing status, the
    extracted content, and chunk/embedding bookkeeping. The link to a
    knowledge base is a plain integer column without a foreign key.
    """

    __tablename__ = "documents"

    # Id of the owning knowledge base. The ForeignKey("knowledge_bases.id")
    # constraint was removed, so this is a plain integer column.
    knowledge_base_id: Mapped[int] = mapped_column(Integer, nullable=False)

    # File information.
    filename: Mapped[str] = mapped_column(String(255), nullable=False)
    original_filename: Mapped[str] = mapped_column(
        String(255),
        nullable=False,
    )
    file_path: Mapped[str] = mapped_column(String(500), nullable=False)
    # Size in bytes.
    file_size: Mapped[int] = mapped_column(Integer, nullable=False)
    # File extension such as .pdf, .txt, .docx, etc.
    file_type: Mapped[str] = mapped_column(String(50), nullable=False)
    mime_type: Mapped[Optional[str]] = mapped_column(
        String(100),
        nullable=True,
    )

    # Processing status.
    is_processed: Mapped[bool] = mapped_column(
        Boolean,
        nullable=False,
        default=False,
    )
    processing_error: Mapped[Optional[str]] = mapped_column(
        Text,
        nullable=True,
    )

    # Content and metadata.
    # Extracted text content.
    content: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
    # Additional metadata, stored as JSON.
    doc_metadata: Mapped[Optional[dict]] = mapped_column(JSON, nullable=True)

    # Chunking information.
    chunk_count: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=0,
    )

    # Embedding information.
    embedding_model: Mapped[Optional[str]] = mapped_column(
        String(100),
        nullable=True,
    )
    # Vector database IDs for this document's chunks, stored as JSON.
    vector_ids: Mapped[Optional[list]] = mapped_column(JSON, nullable=True)

    # Relationships were removed to eliminate foreign key constraints.

    def __repr__(self) -> str:
        """Return a short debug representation of the document."""
        # NOTE(review): ``id`` is not declared in this class; presumably it
        # is provided by ``BaseModel`` -- confirm.
        return f"<Document(id={self.id}, filename='{self.filename}', kb_id={self.knowledge_base_id})>"

    @property
    def file_size_mb(self) -> float:
        """Return ``file_size`` converted from bytes to MB, rounded to 2 places."""
        bytes_per_mb = 1024 * 1024
        return round(self.file_size / bytes_per_mb, 2)

    @property
    def is_text_file(self) -> bool:
        """Return True when ``file_type`` is .txt, .md or .csv (any case)."""
        extension = self.file_type.lower()
        return extension in (".txt", ".md", ".csv")

    @property
    def is_pdf_file(self) -> bool:
        """Return True when ``file_type`` is .pdf (any case)."""
        extension = self.file_type.lower()
        return extension == ".pdf"

    @property
    def is_office_file(self) -> bool:
        """Return True when ``file_type`` is .docx, .xlsx or .pptx (any case)."""
        extension = self.file_type.lower()
        return extension in (".docx", ".xlsx", ".pptx")