vector.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. # dialects/oracle/vector.py
  2. # Copyright (C) 2005-2025 the SQLAlchemy authors and contributors
  3. # <see AUTHORS file>
  4. #
  5. # This module is part of SQLAlchemy and is released under
  6. # the MIT License: https://www.opensource.org/licenses/mit-license.php
  7. # mypy: ignore-errors
  8. from __future__ import annotations
  9. import array
  10. from dataclasses import dataclass
  11. from enum import Enum
  12. from typing import Optional
  13. from typing import Union
  14. import sqlalchemy.types as types
  15. from sqlalchemy.types import Float
  16. class VectorIndexType(Enum):
  17. """Enum representing different types of VECTOR index structures.
  18. See :ref:`oracle_vector_datatype` for background.
  19. .. versionadded:: 2.0.41
  20. """
  21. HNSW = "HNSW"
  22. """
  23. The HNSW (Hierarchical Navigable Small World) index type.
  24. """
  25. IVF = "IVF"
  26. """
  27. The IVF (Inverted File Index) index type
  28. """
  29. class VectorDistanceType(Enum):
  30. """Enum representing different types of vector distance metrics.
  31. See :ref:`oracle_vector_datatype` for background.
  32. .. versionadded:: 2.0.41
  33. """
  34. EUCLIDEAN = "EUCLIDEAN"
  35. """Euclidean distance (L2 norm).
  36. Measures the straight-line distance between two vectors in space.
  37. """
  38. DOT = "DOT"
  39. """Dot product similarity.
  40. Measures the algebraic similarity between two vectors.
  41. """
  42. COSINE = "COSINE"
  43. """Cosine similarity.
  44. Measures the cosine of the angle between two vectors.
  45. """
  46. MANHATTAN = "MANHATTAN"
  47. """Manhattan distance (L1 norm).
  48. Calculates the sum of absolute differences across dimensions.
  49. """
  50. class VectorStorageFormat(Enum):
  51. """Enum representing the data format used to store vector components.
  52. See :ref:`oracle_vector_datatype` for background.
  53. .. versionadded:: 2.0.41
  54. """
  55. INT8 = "INT8"
  56. """
  57. 8-bit integer format.
  58. """
  59. BINARY = "BINARY"
  60. """
  61. Binary format.
  62. """
  63. FLOAT32 = "FLOAT32"
  64. """
  65. 32-bit floating-point format.
  66. """
  67. FLOAT64 = "FLOAT64"
  68. """
  69. 64-bit floating-point format.
  70. """
  71. class VectorStorageType(Enum):
  72. """Enum representing the vector type,
  73. See :ref:`oracle_vector_datatype` for background.
  74. .. versionadded:: 2.0.43
  75. """
  76. SPARSE = "SPARSE"
  77. """
  78. A Sparse vector is a vector which has zero value for
  79. most of its dimensions.
  80. """
  81. DENSE = "DENSE"
  82. """
  83. A Dense vector is a vector where most, if not all, elements
  84. hold meaningful values.
  85. """
  86. @dataclass
  87. class VectorIndexConfig:
  88. """Define the configuration for Oracle VECTOR Index.
  89. See :ref:`oracle_vector_datatype` for background.
  90. .. versionadded:: 2.0.41
  91. :param index_type: Enum value from :class:`.VectorIndexType`
  92. Specifies the indexing method. For HNSW, this must be
  93. :attr:`.VectorIndexType.HNSW`.
  94. :param distance: Enum value from :class:`.VectorDistanceType`
  95. specifies the metric for calculating distance between VECTORS.
  96. :param accuracy: interger. Should be in the range 0 to 100
  97. Specifies the accuracy of the nearest neighbor search during
  98. query execution.
  99. :param parallel: integer. Specifies degree of parallelism.
  100. :param hnsw_neighbors: interger. Should be in the range 0 to
  101. 2048. Specifies the number of nearest neighbors considered
  102. during the search. The attribute :attr:`.VectorIndexConfig.hnsw_neighbors`
  103. is HNSW index specific.
  104. :param hnsw_efconstruction: integer. Should be in the range 0
  105. to 65535. Controls the trade-off between indexing speed and
  106. recall quality during index construction. The attribute
  107. :attr:`.VectorIndexConfig.hnsw_efconstruction` is HNSW index
  108. specific.
  109. :param ivf_neighbor_partitions: integer. Should be in the range
  110. 0 to 10,000,000. Specifies the number of partitions used to
  111. divide the dataset. The attribute
  112. :attr:`.VectorIndexConfig.ivf_neighbor_partitions` is IVF index
  113. specific.
  114. :param ivf_sample_per_partition: integer. Should be between 1
  115. and ``num_vectors / neighbor partitions``. Specifies the
  116. number of samples used per partition. The attribute
  117. :attr:`.VectorIndexConfig.ivf_sample_per_partition` is IVF index
  118. specific.
  119. :param ivf_min_vectors_per_partition: integer. From 0 (no trimming)
  120. to the total number of vectors (results in 1 partition). Specifies
  121. the minimum number of vectors per partition. The attribute
  122. :attr:`.VectorIndexConfig.ivf_min_vectors_per_partition`
  123. is IVF index specific.
  124. """
  125. index_type: VectorIndexType = VectorIndexType.HNSW
  126. distance: Optional[VectorDistanceType] = None
  127. accuracy: Optional[int] = None
  128. hnsw_neighbors: Optional[int] = None
  129. hnsw_efconstruction: Optional[int] = None
  130. ivf_neighbor_partitions: Optional[int] = None
  131. ivf_sample_per_partition: Optional[int] = None
  132. ivf_min_vectors_per_partition: Optional[int] = None
  133. parallel: Optional[int] = None
  134. def __post_init__(self):
  135. self.index_type = VectorIndexType(self.index_type)
  136. for field in [
  137. "hnsw_neighbors",
  138. "hnsw_efconstruction",
  139. "ivf_neighbor_partitions",
  140. "ivf_sample_per_partition",
  141. "ivf_min_vectors_per_partition",
  142. "parallel",
  143. "accuracy",
  144. ]:
  145. value = getattr(self, field)
  146. if value is not None and not isinstance(value, int):
  147. raise TypeError(
  148. f"{field} must be an integer if"
  149. f"provided, got {type(value).__name__}"
  150. )
  151. class SparseVector:
  152. """
  153. Lightweight SQLAlchemy-side version of SparseVector.
  154. This mimics oracledb.SparseVector.
  155. .. versionadded:: 2.0.43
  156. """
  157. def __init__(
  158. self,
  159. num_dimensions: int,
  160. indices: Union[list, array.array],
  161. values: Union[list, array.array],
  162. ):
  163. if not isinstance(indices, array.array) or indices.typecode != "I":
  164. indices = array.array("I", indices)
  165. if not isinstance(values, array.array):
  166. values = array.array("d", values)
  167. if len(indices) != len(values):
  168. raise TypeError("indices and values must be of the same length!")
  169. self.num_dimensions = num_dimensions
  170. self.indices = indices
  171. self.values = values
  172. def __str__(self):
  173. return (
  174. f"SparseVector(num_dimensions={self.num_dimensions}, "
  175. f"size={len(self.indices)}, typecode={self.values.typecode})"
  176. )
  177. class VECTOR(types.TypeEngine):
  178. """Oracle VECTOR datatype.
  179. For complete background on using this type, see
  180. :ref:`oracle_vector_datatype`.
  181. .. versionadded:: 2.0.41
  182. """
  183. cache_ok = True
  184. __visit_name__ = "VECTOR"
  185. _typecode_map = {
  186. VectorStorageFormat.INT8: "b", # Signed int
  187. VectorStorageFormat.BINARY: "B", # Unsigned int
  188. VectorStorageFormat.FLOAT32: "f", # Float
  189. VectorStorageFormat.FLOAT64: "d", # Double
  190. }
  191. def __init__(self, dim=None, storage_format=None, storage_type=None):
  192. """Construct a VECTOR.
  193. :param dim: integer. The dimension of the VECTOR datatype. This
  194. should be an integer value.
  195. :param storage_format: VectorStorageFormat. The VECTOR storage
  196. type format. This should be Enum values form
  197. :class:`.VectorStorageFormat` INT8, BINARY, FLOAT32, or FLOAT64.
  198. :param storage_type: VectorStorageType. The Vector storage type. This
  199. should be Enum values from :class:`.VectorStorageType` SPARSE or
  200. DENSE.
  201. """
  202. if dim is not None and not isinstance(dim, int):
  203. raise TypeError("dim must be an interger")
  204. if storage_format is not None and not isinstance(
  205. storage_format, VectorStorageFormat
  206. ):
  207. raise TypeError(
  208. "storage_format must be an enum of type VectorStorageFormat"
  209. )
  210. if storage_type is not None and not isinstance(
  211. storage_type, VectorStorageType
  212. ):
  213. raise TypeError(
  214. "storage_type must be an enum of type VectorStorageType"
  215. )
  216. self.dim = dim
  217. self.storage_format = storage_format
  218. self.storage_type = storage_type
  219. def _cached_bind_processor(self, dialect):
  220. """
  221. Converts a Python-side SparseVector instance into an
  222. oracledb.SparseVectormor a compatible array format before
  223. binding it to the database.
  224. """
  225. def process(value):
  226. if value is None or isinstance(value, array.array):
  227. return value
  228. # Convert list to a array.array
  229. elif isinstance(value, list):
  230. typecode = self._array_typecode(self.storage_format)
  231. value = array.array(typecode, value)
  232. return value
  233. # Convert SqlAlchemy SparseVector to oracledb SparseVector object
  234. elif isinstance(value, SparseVector):
  235. return dialect.dbapi.SparseVector(
  236. value.num_dimensions,
  237. value.indices,
  238. value.values,
  239. )
  240. else:
  241. raise TypeError(
  242. """
  243. Invalid input for VECTOR: expected a list, an array.array,
  244. or a SparseVector object.
  245. """
  246. )
  247. return process
  248. def _cached_result_processor(self, dialect, coltype):
  249. """
  250. Converts database-returned values into Python-native representations.
  251. If the value is an oracledb.SparseVector, it is converted into the
  252. SQLAlchemy-side SparseVector class.
  253. If the value is a array.array, it is converted to a plain Python list.
  254. """
  255. def process(value):
  256. if value is None:
  257. return None
  258. elif isinstance(value, array.array):
  259. return list(value)
  260. # Convert Oracledb SparseVector to SqlAlchemy SparseVector object
  261. elif isinstance(value, dialect.dbapi.SparseVector):
  262. return SparseVector(
  263. num_dimensions=value.num_dimensions,
  264. indices=value.indices,
  265. values=value.values,
  266. )
  267. return process
  268. def _array_typecode(self, typecode):
  269. """
  270. Map storage format to array typecode.
  271. """
  272. return self._typecode_map.get(typecode, "d")
  273. class comparator_factory(types.TypeEngine.Comparator):
  274. def l2_distance(self, other):
  275. return self.op("<->", return_type=Float)(other)
  276. def inner_product(self, other):
  277. return self.op("<#>", return_type=Float)(other)
  278. def cosine_distance(self, other):
  279. return self.op("<=>", return_type=Float)(other)