hyf-backend/th_agenter/models/knowledge_base.py

"""Knowledge base models."""

from typing import Optional
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy import String, Integer, Text, Boolean, JSON

from ..db.base import BaseModel

class KnowledgeBase(BaseModel):
    """ORM model for a knowledge base and its indexing configuration.

    Each row carries the display name, the embedding/chunking parameters
    applied to its documents, and the vector database settings.
    """

    __tablename__ = "knowledge_bases"

    # Indexed for lookup by name; names are not required to be unique.
    name: Mapped[str] = mapped_column(
        String(100),
        nullable=False,
        index=True,
        unique=False,
    )
    description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)

    # Embedding and chunking parameters, with their column defaults.
    embedding_model: Mapped[str] = mapped_column(
        String(100),
        default="sentence-transformers/all-MiniLM-L6-v2",
        nullable=False,
    )
    chunk_size: Mapped[int] = mapped_column(
        Integer,
        default=1000,
        nullable=False,
    )
    chunk_overlap: Mapped[int] = mapped_column(
        Integer,
        default=200,
        nullable=False,
    )
    is_active: Mapped[bool] = mapped_column(
        Boolean,
        nullable=False,
        default=True,
    )

    # Vector database settings
    vector_db_type: Mapped[str] = mapped_column(
        String(50),
        default="chroma",
        nullable=False,
    )
    # Name of the collection inside the vector database; optional.
    collection_name: Mapped[Optional[str]] = mapped_column(
        String(100),
        nullable=True,
    )

    # NOTE: The ORM relationship to documents was removed to eliminate
    # foreign key constraints. The former `document_count` and
    # `active_document_count` properties relied on `self.documents` and were
    # disabled with it; they need to be rewritten (e.g. as queries filtering
    # on Document.knowledge_base_id) before they can be restored.

    def __repr__(self) -> str:
        """Return a short debug representation with the id and name."""
        return f"<KnowledgeBase(id={self.id}, name='{self.name}')>"


class Document(BaseModel):
    """ORM model for a file uploaded into a knowledge base.

    Records where the file is stored, its processing status, the extracted
    text, and the bookkeeping needed to find its chunks in the vector
    database.
    """

    __tablename__ = "documents"

    # Plain integer reference to the owning knowledge base. The ForeignKey
    # was removed on purpose, so the database does not enforce that the
    # referenced row exists.
    knowledge_base_id: Mapped[int] = mapped_column(Integer, nullable=False)  # Removed ForeignKey("knowledge_bases.id")
    filename: Mapped[str] = mapped_column(String(255), nullable=False)
    original_filename: Mapped[str] = mapped_column(String(255), nullable=False)
    file_path: Mapped[str] = mapped_column(String(500), nullable=False)
    file_size: Mapped[int] = mapped_column(Integer, nullable=False)  # in bytes
    file_type: Mapped[str] = mapped_column(String(50), nullable=False)  # .pdf, .txt, .docx, etc.
    mime_type: Mapped[Optional[str]] = mapped_column(String(100), nullable=True)

    # Processing status
    is_processed: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)
    processing_error: Mapped[Optional[str]] = mapped_column(Text, nullable=True)

    # Content and metadata
    content: Mapped[Optional[str]] = mapped_column(Text, nullable=True)  # Extracted text content
    doc_metadata: Mapped[Optional[dict]] = mapped_column(JSON, nullable=True)  # Additional metadata

    # Chunking information
    chunk_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)

    # Embedding information
    embedding_model: Mapped[Optional[str]] = mapped_column(String(100), nullable=True)
    vector_ids: Mapped[Optional[list]] = mapped_column(JSON, nullable=True)  # Store vector database IDs for chunks

    # Relationships removed to eliminate foreign key constraints

    def __repr__(self) -> str:
        """Return a short debug representation (id, filename, knowledge base id)."""
        return f"<Document(id={self.id}, filename='{self.filename}', kb_id={self.knowledge_base_id})>"

    def _normalized_file_type(self) -> str:
        """Return ``file_type`` lower-cased, or ``""`` when it is not set.

        ``nullable=False`` is enforced by the database, not on attribute
        access, so an instance that has not been populated yet can still
        hold ``None`` here.
        """
        return (self.file_type or "").lower()

    @property
    def file_size_mb(self) -> float:
        """Get file size in MB, rounded to two decimals.

        Returns ``0.0`` when ``file_size`` has not been set, instead of
        raising ``TypeError``.
        """
        if self.file_size is None:
            return 0.0
        return round(self.file_size / (1024 * 1024), 2)

    @property
    def is_text_file(self) -> bool:
        """Check if document is a text file (``.txt``, ``.md`` or ``.csv``)."""
        return self._normalized_file_type() in ('.txt', '.md', '.csv')

    @property
    def is_pdf_file(self) -> bool:
        """Check if document is a PDF file."""
        return self._normalized_file_type() == '.pdf'

    @property
    def is_office_file(self) -> bool:
        """Check if document is an Office file (``.docx``, ``.xlsx`` or ``.pptx``)."""
        return self._normalized_file_type() in ('.docx', '.xlsx', '.pptx')
chore: 提交所有代码 2026-01-21 13:45:39 +08:00			`"""Knowledge base models."""`

			`from typing import Optional`
			`from sqlalchemy.orm import Mapped, mapped_column`
			`from sqlalchemy import String, Integer, Text, Boolean, JSON`

			`from ..db.base import BaseModel`

			`class KnowledgeBase(BaseModel):`
			`"""Knowledge base model."""`

			`__tablename__ = "knowledge_bases"`

			`name: Mapped[str] = mapped_column(String(100), unique=False, index=True, nullable=False)`
			`description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)`
			`embedding_model: Mapped[str] = mapped_column(String(100), nullable=False, default="sentence-transformers/all-MiniLM-L6-v2")`
			`chunk_size: Mapped[int] = mapped_column(Integer, nullable=False, default=1000)`
			`chunk_overlap: Mapped[int] = mapped_column(Integer, nullable=False, default=200)`
			`is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)`

			`# Vector database settings`
			`vector_db_type: Mapped[str] = mapped_column(String(50), nullable=False, default="chroma")`
			`collection_name: Mapped[Optional[str]] = mapped_column(String(100), nullable=True) # For vector DB collection`

			`# Relationships removed to eliminate foreign key constraints`

			`def __repr__(self):`
			`return f"<KnowledgeBase(id={self.id}, name='{self.name}')>"`

			`# Relationships are commented out to remove foreign key constraints, so these properties should be updated`
			`# @property`
			`# def document_count(self):`
			`# """Get the number of documents in this knowledge base."""`
			`# return len(self.documents)`

			`# @property`
			`# def active_document_count(self):`
			`# """Get the number of active documents in this knowledge base."""`
			`# return len([doc for doc in self.documents if doc.is_processed])`


			`class Document(BaseModel):`
			`"""Document model."""`

			`__tablename__ = "documents"`

			`knowledge_base_id: Mapped[int] = mapped_column(Integer, nullable=False) # Removed ForeignKey("knowledge_bases.id")`
			`filename: Mapped[str] = mapped_column(String(255), nullable=False)`
			`original_filename: Mapped[str] = mapped_column(String(255), nullable=False)`
			`file_path: Mapped[str] = mapped_column(String(500), nullable=False)`
			`file_size: Mapped[int] = mapped_column(Integer, nullable=False) # in bytes`
			`file_type: Mapped[str] = mapped_column(String(50), nullable=False) # .pdf, .txt, .docx, etc.`
			`mime_type: Mapped[Optional[str]] = mapped_column(String(100), nullable=True)`

			`# Processing status`
			`is_processed: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)`
			`processing_error: Mapped[Optional[str]] = mapped_column(Text, nullable=True)`

			`# Content and metadata`
			`content: Mapped[Optional[str]] = mapped_column(Text, nullable=True) # Extracted text content`
			`doc_metadata: Mapped[Optional[dict]] = mapped_column(JSON, nullable=True) # Additional metadata`

			`# Chunking information`
			`chunk_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)`

			`# Embedding information`
			`embedding_model: Mapped[Optional[str]] = mapped_column(String(100), nullable=True)`
			`vector_ids: Mapped[Optional[list]] = mapped_column(JSON, nullable=True) # Store vector database IDs for chunks`

			`# Relationships removed to eliminate foreign key constraints`

			`def __repr__(self):`
			`return f"<Document(id={self.id}, filename='{self.filename}', kb_id={self.knowledge_base_id})>"`

			`@property`
			`def file_size_mb(self):`
			`"""Get file size in MB."""`
			`return round(self.file_size / (1024 * 1024), 2)`

			`@property`
			`def is_text_file(self):`
			`"""Check if document is a text file."""`
			`return self.file_type.lower() in ['.txt', '.md', '.csv']`

			`@property`
			`def is_pdf_file(self):`
			`"""Check if document is a PDF file."""`
			`return self.file_type.lower() == '.pdf'`

			`@property`
			`def is_office_file(self):`
			`"""Check if document is an Office file."""`
			`return self.file_type.lower() in ['.docx', '.xlsx', '.pptx']`