File size: 2,771 Bytes
79899c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from dataclasses import dataclass, field
from typing import Optional
from utils.snowflake_id import snowflake_id_str


@dataclass
class BaseBioDocument:
    """Base class for biomedical documents.

    Declares the fields shared by every search type.
    """

    # Filled in by calling snowflake_id_str() when the caller gives no value.
    bio_id: Optional[str] = field(default_factory=snowflake_id_str)
    # The remaining common fields are all optional and start out as None.
    title: Optional[str] = field(default=None)
    text: Optional[str] = field(default=None)
    source: Optional[str] = field(default=None)
    source_id: Optional[str] = field(default=None)


@dataclass
class PubMedDocument(BaseBioDocument):
    """PubMed academic literature document.

    Adds the fields specific to academic literature on top of the common
    base fields. When ``source`` is left as None it is set to ``"pubmed"``.
    """

    abstract: Optional[str] = field(default=None)
    authors: Optional[str] = field(default=None)
    doi: Optional[str] = field(default=None)
    journal: Optional[str] = field(default=None)
    pub_date: Optional[str] = field(default=None)
    # NOTE(review): presumably the journal impact factor - confirm.
    if_score: Optional[float] = field(default=None)
    url: Optional[str] = field(default=None)

    def __post_init__(self) -> None:
        """Fill in the default source label when none was supplied."""
        # An explicitly provided source is kept untouched.
        if self.source is not None:
            return
        self.source = "pubmed"


@dataclass
class PersonalDocument(BaseBioDocument):
    """Personal vector-search document.

    Adds the fields specific to personal documents on top of the common
    base fields. When ``source`` is left as None it is set to
    ``"personal_vector"``.
    """

    if_score: Optional[float] = field(default=None)
    doc_id: Optional[str] = field(default=None)
    # Unlike the other fields, index defaults to 0 rather than None.
    index: Optional[int] = field(default=0)
    user_id: Optional[str] = field(default=None)
    file_name: Optional[str] = field(default=None)

    def __post_init__(self) -> None:
        """Fill in the default source label when none was supplied."""
        # An explicitly provided source is kept untouched.
        if self.source is not None:
            return
        self.source = "personal_vector"


@dataclass
class WebDocument(BaseBioDocument):
    """Web search document.

    Adds the fields specific to web page content on top of the common
    base fields. When ``source`` is left as None it is set to ``"web"``.
    """

    url: Optional[str] = field(default=None)
    description: Optional[str] = field(default=None)

    def __post_init__(self) -> None:
        """Fill in the default source label when none was supplied."""
        # An explicitly provided source is kept untouched.
        if self.source is not None:
            return
        self.source = "web"


# The original BioDocument class is kept for backward compatibility.
@dataclass
class BioDocument(BaseBioDocument):
    """Generic biomedical document (backward compatible).

    Declares the literature-style fields plus ``doc_id`` on top of the common
    base fields; prefer the dedicated document types where possible. No
    default ``source`` is assigned here, so it stays None unless provided.

    NOTE(review): ``description``, ``index``, ``user_id`` and ``file_name``
    from the dedicated types are not declared on this class - confirm whether
    that is intentional.
    """

    abstract: Optional[str] = field(default=None)
    authors: Optional[str] = field(default=None)
    doi: Optional[str] = field(default=None)
    journal: Optional[str] = field(default=None)
    pub_date: Optional[str] = field(default=None)
    if_score: Optional[float] = field(default=None)
    url: Optional[str] = field(default=None)
    doc_id: Optional[str] = field(default=None)


# Factory function: create the document object that matches the source type.
def create_bio_document(source: str, **kwargs) -> BaseBioDocument:
    """Create the document object that corresponds to ``source``.

    Args:
        source: Document source type. ``"pubmed"``, ``"personal_vector"`` and
            ``"web"`` select the dedicated document classes; any other value
            falls back to the generic ``BioDocument``.
        **kwargs: Document fields, forwarded unchanged to the selected class.
            A field the selected class does not declare makes its constructor
            raise ``TypeError``.

    Returns:
        An instance of the document class selected by ``source``.
    """
    if source == "pubmed":
        return PubMedDocument(**kwargs)
    if source == "personal_vector":
        return PersonalDocument(**kwargs)
    if source == "web":
        return WebDocument(**kwargs)
    # Fall back to the generic BioDocument. The dedicated classes fill in
    # their own source label in __post_init__, but BioDocument does not, so
    # the caller's source must be forwarded or it would be lost (left None).
    # ``source`` is a named parameter here, so it can never also be in kwargs.
    return BioDocument(source=source, **kwargs)