basic-memory/src/basic_memory/repository/entity_repository.py at c1699df68a393c7382d5c124937b48a0cd9b449b · basicmachines-co/basic-memory · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
"""Repository for managing entities in the knowledge graph."""

from pathlib import Path
from typing import List, Optional, Sequence, Union, Any


from loguru import logger
from sqlalchemy import select, func
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from sqlalchemy.orm import selectinload
from sqlalchemy.orm.interfaces import LoaderOption
from sqlalchemy.engine import Row

from basic_memory import db
from basic_memory.models.knowledge import Entity, Observation, Relation
from basic_memory.repository.repository import Repository


class EntityRepository(Repository[Entity]):
    """Repository for Entity model.

    Note: All file paths are stored as strings in the database. Convert Path objects
    to strings before passing to repository methods.
    """

    def __init__(self, session_maker: async_sessionmaker[AsyncSession], project_id: int) -> None:
        """Bind the repository to the ``Entity`` model and a single project.

        Args:
            session_maker: SQLAlchemy async session maker, forwarded to the base
                ``Repository``.
            project_id: Project ID forwarded to the base ``Repository`` as
                ``project_id``; used to filter all operations by project.
        """
        super().__init__(session_maker, Entity, project_id=project_id)

    async def get_by_id(self, entity_id: int) -> Optional[Entity]:  # pragma: no cover
        """Look up a single entity by its numeric primary key.

        Opens a scoped session and delegates the lookup to ``select_by_id``.

        Args:
            entity_id: Numeric entity ID

        Returns:
            The matching Entity, or None when no row has that ID
        """
        scoped = db.scoped_session(self.session_maker)
        async with scoped as session:
            entity = await self.select_by_id(session, entity_id)
            return entity

    async def _find_one_by_query(self, query, *, load_relations: bool) -> Optional[Entity]:
        """Run ``query`` and return at most one entity.

        Args:
            query: Select statement expected to match zero or one entity
            load_relations: When True the query goes through ``find_one``;
                when False it is executed with ``use_query_options=False``

        Returns:
            The single matching Entity, or None when nothing matches
        """
        if not load_relations:
            # Lightweight path: execute without the repository's query options
            rows = await self.execute_query(query, use_query_options=False)
            return rows.scalars().one_or_none()

        return await self.find_one(query)

    async def get_by_external_id(
        self, external_id: str, *, load_relations: bool = True
    ) -> Optional[Entity]:
        """Fetch the entity whose ``external_id`` column equals ``external_id``.

        Args:
            external_id: External UUID identifier
            load_relations: Forwarded to ``_find_one_by_query``; pass False to
                skip eager loading of relationships

        Returns:
            Entity if found, None otherwise
        """
        matches_external_id = Entity.external_id == external_id
        return await self._find_one_by_query(
            self.select().where(matches_external_id),
            load_relations=load_relations,
        )

    async def get_by_permalink(
        self, permalink: str, *, load_relations: bool = True
    ) -> Optional[Entity]:
        """Fetch the entity whose ``permalink`` column equals ``permalink``.

        Args:
            permalink: Unique identifier for the entity
            load_relations: Forwarded to ``_find_one_by_query``; pass False to
                skip eager loading of relationships

        Returns:
            Entity if found, None otherwise
        """
        matches_permalink = Entity.permalink == permalink
        return await self._find_one_by_query(
            self.select().where(matches_permalink),
            load_relations=load_relations,
        )

    async def get_by_title(self, title: str, *, load_relations: bool = True) -> Sequence[Entity]:
        """Return every entity titled ``title``, shortest file path first.

        Several entities may share a title when they live in different folders.
        Results are sorted by the length of ``file_path`` and then by
        ``file_path`` itself, so the first element is the "shortest path" match.

        Args:
            title: Title of the entity to find
            load_relations: Passed to ``execute_query`` as ``use_query_options``

        Returns:
            List of matching entities (empty when none match)
        """
        shortest_path_first = (func.length(Entity.file_path), Entity.file_path)
        stmt = self.select().where(Entity.title == title)
        stmt = stmt.order_by(*shortest_path_first)

        rows = await self.execute_query(stmt, use_query_options=load_relations)
        return list(rows.scalars().all())

    async def get_by_file_path(
        self, file_path: Union[Path, str], *, load_relations: bool = True
    ) -> Optional[Entity]:
        """Fetch the entity stored under ``file_path``.

        Args:
            file_path: Path to the entity file; normalized to a POSIX string
                before being compared with the ``file_path`` column
            load_relations: Forwarded to ``_find_one_by_query``; pass False to
                skip eager loading of relationships

        Returns:
            Entity if found, None otherwise
        """
        posix_path = Path(file_path).as_posix()
        stmt = self.select().where(Entity.file_path == posix_path)
        return await self._find_one_by_query(stmt, load_relations=load_relations)

    # -------------------------------------------------------------------------
    # Lightweight methods for permalink resolution (no eager loading)
    # -------------------------------------------------------------------------

    async def permalink_exists(self, permalink: str) -> bool:
        """Report whether any entity uses ``permalink``.

        Only ``Entity.id`` is selected (limited to one row) and the query runs
        with ``use_query_options=False``, so no entity or relationship is
        loaded. Prefer this over get_by_permalink() for existence checks in
        bulk operations.

        Args:
            permalink: Permalink to check

        Returns:
            True if permalink exists, False otherwise
        """
        stmt = self._add_project_filter(
            select(Entity.id).where(Entity.permalink == permalink).limit(1)
        )
        rows = await self.execute_query(stmt, use_query_options=False)
        matched_id = rows.scalar_one_or_none()
        return matched_id is not None

    async def get_file_path_for_permalink(self, permalink: str) -> Optional[str]:
        """Resolve a permalink to its file_path using a single-column query.

        Selects only ``Entity.file_path`` and runs with
        ``use_query_options=False``, so the full entity and its relations are
        never loaded.

        Args:
            permalink: Permalink to look up

        Returns:
            file_path string if found, None otherwise
        """
        stmt = self._add_project_filter(
            select(Entity.file_path).where(Entity.permalink == permalink)
        )
        rows = await self.execute_query(stmt, use_query_options=False)
        return rows.scalar_one_or_none()

    async def get_permalink_for_file_path(self, file_path: Union[Path, str]) -> Optional[str]:
        """Resolve a file_path to its permalink using a single-column query.

        Selects only ``Entity.permalink`` and runs with
        ``use_query_options=False``, so the full entity and its relations are
        never loaded.

        Args:
            file_path: File path to look up; normalized to a POSIX string first

        Returns:
            permalink string if found, None otherwise
        """
        posix_path = Path(file_path).as_posix()
        stmt = self._add_project_filter(
            select(Entity.permalink).where(Entity.file_path == posix_path)
        )
        rows = await self.execute_query(stmt, use_query_options=False)
        return rows.scalar_one_or_none()

    async def get_all_permalinks(self) -> List[str]:
        """Return the permalink column for every entity in this project.

        Intended for bulk operations: only the permalink values are selected,
        and the query runs with ``use_query_options=False`` so no entities or
        relationships are loaded.

        Returns:
            List of all permalinks in the project
        """
        stmt = self._add_project_filter(select(Entity.permalink))
        rows = await self.execute_query(stmt, use_query_options=False)
        permalinks = rows.scalars().all()
        return list(permalinks)

    async def get_permalink_to_file_path_map(self) -> dict[str, str]:
        """Build a permalink -> file_path lookup table in one query.

        Selects just the two columns and runs with ``use_query_options=False``,
        which keeps bulk permalink resolution cheap.

        Returns:
            Dict mapping permalink to file_path
        """
        stmt = self._add_project_filter(select(Entity.permalink, Entity.file_path))
        rows = await self.execute_query(stmt, use_query_options=False)

        # Rows arrive as (permalink, file_path) in the order the columns were selected
        return {permalink: file_path for permalink, file_path in rows.all()}

    async def get_file_path_to_permalink_map(self) -> dict[str, str]:
        """Build a file_path -> permalink lookup table in one query.

        Selects just the two columns and runs with ``use_query_options=False``,
        which keeps bulk permalink resolution cheap.

        Returns:
            Dict mapping file_path to permalink
        """
        stmt = self._add_project_filter(select(Entity.file_path, Entity.permalink))
        rows = await self.execute_query(stmt, use_query_options=False)

        # Rows arrive as (file_path, permalink) in the order the columns were selected
        return {file_path: permalink for file_path, permalink in rows.all()}

    async def get_by_file_paths(
        self, session: AsyncSession, file_paths: Sequence[Union[Path, str]]
    ) -> List[Row[Any]]:
        """Fetch (file_path, checksum) rows for a batch of paths.

        Built for change detection: only the ``file_path`` and ``checksum``
        columns are selected, so no Entity objects or relationships are loaded.
        The query runs on the caller-supplied session.

        Args:
            session: Database session to use for the query
            file_paths: File paths to query; each is normalized to a POSIX string

        Returns:
            List of (file_path, checksum) rows for matching entities; empty when
            ``file_paths`` is empty
        """
        if not file_paths:  # pragma: no cover
            return []  # pragma: no cover

        # Normalize so the comparison matches the POSIX strings used for lookups
        wanted = [Path(raw).as_posix() for raw in file_paths]  # pragma: no cover

        stmt = self._add_project_filter(  # pragma: no cover
            select(Entity.file_path, Entity.checksum).where(Entity.file_path.in_(wanted))
        )
        rows = await session.execute(stmt)  # pragma: no cover
        return list(rows.all())  # pragma: no cover

    async def find_by_checksum(self, checksum: str) -> Sequence[Entity]:
        """Return all entities whose content checksum equals ``checksum``.

        Supports move detection: an entity with the same checksum may be a file
        that moved to a new path. Copies of a file share a checksum, so more
        than one entity can match.

        Args:
            checksum: File content checksum to search for

        Returns:
            Sequence of entities with matching checksum (may be empty)
        """
        stmt = self.select().where(Entity.checksum == checksum)

        # Relationships are not needed for move detection, so skip query options
        rows = await self.execute_query(stmt, use_query_options=False)
        matches = rows.scalars().all()
        return list(matches)

    async def find_by_checksums(self, checksums: Sequence[str]) -> Sequence[Entity]:
        """Return all entities whose checksum is one of ``checksums``.

        Batch counterpart of find_by_checksum(): one ``IN (...)`` query replaces
        one query per checksum, which matters when many new files must be
        checked for moves at once.

        Example:
            While processing new files, look up their checksums here; a match at
            a different path suggests the file was moved rather than created.

        Args:
            checksums: File content checksums to search for

        Returns:
            Sequence of entities with matching checksums (may be empty).
            Multiple entities may have the same checksum if files were copied.
        """
        if not checksums:  # pragma: no cover
            return []  # pragma: no cover

        stmt = self.select().where(Entity.checksum.in_(checksums))  # pragma: no cover

        # Relationships are not needed for move detection, so skip query options
        rows = await self.execute_query(stmt, use_query_options=False)  # pragma: no cover
        matches = rows.scalars().all()  # pragma: no cover
        return list(matches)  # pragma: no cover

    async def delete_by_file_path(self, file_path: Union[Path, str]) -> bool:
        """Delete the entity stored under ``file_path``.

        Args:
            file_path: Path to the entity file; normalized to a POSIX string
                before being handed to ``delete_by_fields``

        Returns:
            The result of ``delete_by_fields`` for the normalized path
        """
        posix_path = Path(file_path).as_posix()
        return await self.delete_by_fields(file_path=posix_path)

    def get_load_options(self) -> List[LoaderOption]:
        """Build the ``selectinload`` options used to eager-load an entity graph.

        Returns:
            Five loader options, in this order: observations (with their
            entity), then for outgoing relations and for incoming relations the
            ``from_entity`` and ``to_entity`` of each relation.
        """
        options = [selectinload(Entity.observations).selectinload(Observation.entity)]

        # For both relation directions, load each relation's two endpoint entities
        for relation_attr in (Entity.outgoing_relations, Entity.incoming_relations):
            for endpoint_attr in (Relation.from_entity, Relation.to_entity):
                options.append(selectinload(relation_attr).selectinload(endpoint_attr))

        return options

    async def find_by_permalinks(self, permalinks: List[str]) -> Sequence[Entity]:
        """Fetch every entity whose permalink appears in ``permalinks``.

        Args:
            permalinks: List of permalink strings to find

        Returns:
            List of matching entities; empty when ``permalinks`` is empty
        """
        # Nothing to look up: return early without querying
        if not permalinks:
            return []

        stmt = self.select()
        stmt = stmt.options(*self.get_load_options())
        stmt = stmt.where(Entity.permalink.in_(permalinks))

        rows = await self.execute_query(stmt)
        return list(rows.scalars().all())

    async def upsert_entity(self, entity: Entity) -> Entity:
        """Insert an entity, resolving conflicts reported by the database.

        The entity is added and flushed optimistically. If the flush raises
        ``IntegrityError`` the failure is classified as follows:

        * Foreign-key violation (detected from the error text): raised as
          ``SyncFatalError``.
        * A row with the same ``file_path`` and ``project_id`` already exists:
          the incoming entity takes over that row's ``id`` and ``external_id``,
          its observations are re-pointed at that id, and it is merged and
          committed.
        * Otherwise: treated as a permalink conflict and delegated to
          ``_handle_permalink_conflict``, which assigns a suffixed permalink.

        Args:
            entity: The entity to insert or update

        Returns:
            The inserted or updated entity

        Raises:
            SyncFatalError: If the IntegrityError text indicates a foreign-key
                violation.
            RuntimeError: If the entity cannot be re-selected by file_path after
                a successful flush.
        """
        async with db.scoped_session(self.session_maker) as session:
            # Set project_id if applicable and not already set
            self._set_project_id_if_needed(entity)

            # Try simple insert first
            try:
                session.add(entity)
                await session.flush()

                # Re-select by file_path with the eager-load options so the
                # returned entity has its relationships loaded
                query = (
                    self.select()
                    .where(Entity.file_path == entity.file_path)
                    .options(*self.get_load_options())
                )
                result = await session.execute(query)
                found = result.scalar_one_or_none()
                if not found:  # pragma: no cover
                    raise RuntimeError(
                        f"Failed to retrieve entity after insert: {entity.file_path}"
                    )
                return found

            except IntegrityError as e:
                # Check if this is a FOREIGN KEY constraint failure
                # SQLite: "FOREIGN KEY constraint failed"
                # Postgres: "violates foreign key constraint"
                error_str = str(e)
                if (
                    "FOREIGN KEY constraint failed" in error_str
                    or "violates foreign key constraint" in error_str
                ):
                    # Import locally to avoid circular dependency (repository -> services -> repository)
                    from basic_memory.services.exceptions import SyncFatalError

                    # Project doesn't exist in database - this is a fatal sync error
                    raise SyncFatalError(
                        f"Cannot sync file '{entity.file_path}': "
                        f"project_id={entity.project_id} does not exist in database. "
                        f"The project may have been deleted. This sync will be terminated."
                    ) from e

                # Discard the failed flush before issuing further statements
                await session.rollback()

                # Re-query after rollback to get a fresh, attached entity
                existing_result = await session.execute(
                    select(Entity)
                    .where(
                        Entity.file_path == entity.file_path, Entity.project_id == entity.project_id
                    )
                    .options(*self.get_load_options())
                )
                existing_entity = existing_result.scalar_one_or_none()

                if existing_entity:
                    # File path conflict - update the existing entity
                    logger.debug(
                        f"Resolving file_path conflict for {entity.file_path}, "
                        f"entity_id={existing_entity.id}, observations={len(entity.observations)}"
                    )
                    # Use merge to avoid session state conflicts
                    # Set the ID to update existing entity
                    entity.id = existing_entity.id
                    # Preserve the stable external_id so that external references
                    # (e.g. public share links) survive re-indexing
                    entity.external_id = existing_entity.external_id

                    # Ensure observations reference the correct entity_id
                    for obs in entity.observations:
                        obs.entity_id = existing_entity.id
                        # Clear any existing ID to force INSERT as new observation
                        obs.id = None

                    # Merge the entity which will update the existing one
                    merged_entity = await session.merge(entity)

                    await session.commit()

                    # Re-query to get proper relationships loaded
                    final_result = await session.execute(
                        select(Entity)
                        .where(Entity.id == merged_entity.id)
                        .options(*self.get_load_options())
                    )
                    return final_result.scalar_one()

                else:
                    # No row shares this file_path, so the failure is assumed to
                    # be a permalink conflict: generate a unique permalink and retry
                    entity = await self._handle_permalink_conflict(entity, session)
                    return entity

    async def get_all_file_paths(self) -> List[str]:
        """Return the file_path column for every entity in this project.

        Only the file_path values are selected and the query runs with
        ``use_query_options=False``, so no entities or relationships are loaded.
        Used by streaming sync to detect deleted files efficiently.

        Returns:
            List of file_path strings for all entities in the project
        """
        stmt = self._add_project_filter(select(Entity.file_path))
        rows = await self.execute_query(stmt, use_query_options=False)
        paths = rows.scalars().all()
        return list(paths)

    async def get_distinct_directories(self) -> List[str]:
        """Derive the set of directories implied by the stored file paths.

        Selects only the distinct ``file_path`` values (no entities, no
        relationships) and collects every ancestor directory of each path.

        Returns:
            Sorted list of unique directory paths
            (e.g., ["notes", "notes/meetings", "specs"])
        """
        stmt = self._add_project_filter(select(Entity.file_path).distinct())

        # use_query_options=False skips eager loading
        rows = await self.execute_query(stmt, use_query_options=False)

        unique_dirs = set()
        for stored_path in rows.scalars().all():
            segments = [segment for segment in stored_path.split("/") if segment]
            # Each proper prefix of the segments is a directory; the final
            # segment is the filename and is never included on its own
            for depth in range(1, len(segments)):
                unique_dirs.add("/".join(segments[:depth]))

        return sorted(unique_dirs)

    async def find_by_directory_prefix(self, directory_prefix: str) -> Sequence[Entity]:
        """Find entities whose file_path lies under the given directory.

        Matches paths that start with ``"<directory_prefix>/"``, so files in the
        directory and in any of its subdirectories are returned, while files
        whose names merely begin with the prefix are not.

        The prefix is matched literally: ``%`` and ``_`` in a directory name are
        escaped rather than treated as SQL LIKE wildcards. Without that,
        ``my_notes`` would also match ``my-notes/...``.

        Args:
            directory_prefix: Directory path prefix (e.g., "docs", "docs/guides").
                Leading and trailing slashes are ignored. An empty string, or a
                string made only of slashes, means the root directory and
                returns all entities via ``find_all()``.

        Returns:
            Sequence of entities in the specified directory and subdirectories
        """
        # Normalize before the root check so "docs", "/docs" and "docs/" behave
        # the same, and slash-only inputs such as "//" count as root instead of
        # producing a pattern that can never match.
        normalized = directory_prefix.strip("/")
        if not normalized:
            # Root directory - return all entities
            return await self.find_all()

        # startswith() renders a LIKE '<prefix>/%' comparison; autoescape=True
        # escapes any "%" or "_" inside the prefix so they match literally.
        query = self.select().where(
            Entity.file_path.startswith(f"{normalized}/", autoescape=True)
        )

        # Skip eager loading - we only need basic entity fields for directory trees
        result = await self.execute_query(query, use_query_options=False)
        return list(result.scalars().all())

    async def _handle_permalink_conflict(self, entity: Entity, session: AsyncSession) -> Entity:
        """Assign ``entity`` an unused suffixed permalink and flush it.

        Candidates ``"<permalink>-1"``, ``"<permalink>-2"``, ... are probed in
        order until none of the entities in the same project uses the candidate.
        The entity is then added to ``session`` and flushed.

        Args:
            entity: Entity whose current permalink collided
            session: Session used for the probes and the insert

        Returns:
            The same entity, with its permalink replaced by the free candidate

        Raises:
            SyncFatalError: If the flush fails with a foreign-key violation.
            IntegrityError: Re-raised for any other integrity failure on flush.
        """
        original_permalink = entity.permalink
        counter = 0

        # Probe successive suffixes until one is free within the project
        while True:
            counter += 1
            candidate = f"{original_permalink}-{counter}"
            lookup = await session.execute(
                select(Entity).where(
                    Entity.permalink == candidate, Entity.project_id == entity.project_id
                )
            )
            if lookup.scalar_one_or_none() is None:
                break

        entity.permalink = candidate

        # Insert with the unique permalink
        session.add(entity)
        try:
            await session.flush()
        except IntegrityError as e:  # pragma: no cover
            # Foreign-key failures are recognized by their message text:
            # SQLite: "FOREIGN KEY constraint failed"
            # Postgres: "violates foreign key constraint"
            message = str(e)
            is_foreign_key_failure = (
                "FOREIGN KEY constraint failed" in message
                or "violates foreign key constraint" in message
            )
            if not is_foreign_key_failure:
                # Anything else is propagated unchanged
                raise  # pragma: no cover

            # Import locally to avoid circular dependency (repository -> services -> repository)
            from basic_memory.services.exceptions import SyncFatalError

            # Project doesn't exist in database - this is a fatal sync error
            raise SyncFatalError(  # pragma: no cover
                f"Cannot sync file '{entity.file_path}': "
                f"project_id={entity.project_id} does not exist in database. "
                f"The project may have been deleted. This sync will be terminated."
            ) from e
        return entity