From be4fd966030381cd974192e2e7e52b214c433af3 Mon Sep 17 00:00:00 2001 From: DhruvGorasiya Date: Mon, 15 Dec 2025 13:34:39 -0500 Subject: [PATCH] updated docs and migration script --- assets/migrate_weaviate_collections.py | 303 ++++++++++-------- .../troubleshooting/weaviate-v4-migration.mdx | 263 ++++++++++++--- 2 files changed, 379 insertions(+), 187 deletions(-) diff --git a/assets/migrate_weaviate_collections.py b/assets/migrate_weaviate_collections.py index 4631cae7..f0bb95f6 100644 --- a/assets/migrate_weaviate_collections.py +++ b/assets/migrate_weaviate_collections.py @@ -1,15 +1,22 @@ -#!/usr/bin/env python3 """ Migration script to fix Weaviate schema incompatibility between 1.19.0 and 1.27.0+ - This script: - Identifies collections with old schema (no vectorConfig) - Creates new collections with proper vectorConfig including "default" named vector - Migrates data using cursor-based pagination (efficient for large datasets) - Uses batch operations for fast inserts - Preserves all object properties and vectors +Note: +- This is a community-edited version of the draft of the script presented by the Dify Team. +- This script is not officially supported by the Dify Team. +- The original source for this script can be found at https://github.com/langgenius/dify/issues/27291#issuecomment-3501003678. +- The changes made in this script are: + - Retrieve Weaviate connection info from environment variables to make this script run in the Worker container. + - Switch to cursor-based pagination in "replace_old_collection", since the migration could fail with large collections. + - Fix an issue where both the old and new collections remained without being deleted after migrating an empty collection. 
""" +import os import weaviate from weaviate.classes.config import Configure, VectorDistances import sys @@ -17,62 +24,68 @@ import time from typing import List, Dict, Any # Configuration -WEAVIATE_HOST = "localhost" -WEAVIATE_PORT = 8080 -WEAVIATE_GRPC_PORT = 50051 -WEAVIATE_API_KEY = "WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih" -BATCH_SIZE = 100 +WEAVIATE_ENDPOINT = os.getenv("WEAVIATE_ENDPOINT", "http://weaviate:8080") +WEAVIATE_GRPC_ENDPOINT = os.getenv("WEAVIATE_GRPC_ENDPOINT", "grpc://weaviate:50051") +WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY", "WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih") +BATCH_SIZE = 1000 +WEAVIATE_HOST = WEAVIATE_ENDPOINT.split("//")[-1].split(":")[0] +WEAVIATE_PORT = int(WEAVIATE_ENDPOINT.split(":")[-1]) +WEAVIATE_GRPC_PORT = int(WEAVIATE_GRPC_ENDPOINT.split(":")[-1]) def identify_old_collections(client: weaviate.WeaviateClient) -> List[str]: """Identify collections that need migration (those without vectorConfig)""" collections_to_migrate = [] - + all_collections = client.collections.list_all() print(f"Found {len(all_collections)} total collections") - + for collection_name in all_collections.keys(): # Only check Vector_index collections (Dify knowledge bases) if not collection_name.startswith("Vector_index_"): continue - + collection = client.collections.get(collection_name) config = collection.config.get() - + # Check if this collection has the old schema if config.vector_config is None: collections_to_migrate.append(collection_name) print(f" - {collection_name}: OLD SCHEMA (needs migration)") else: print(f" - {collection_name}: NEW SCHEMA (skip)") - + return collections_to_migrate -def get_collection_schema(client: weaviate.WeaviateClient, collection_name: str) -> Dict[str, Any]: +def get_collection_schema( + client: weaviate.WeaviateClient, collection_name: str +) -> Dict[str, Any]: """Get the full schema of a collection via REST API""" import requests - + response = requests.get( 
f"http://{WEAVIATE_HOST}:{WEAVIATE_PORT}/v1/schema/{collection_name}", - headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"} + headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"}, ) - + if response.status_code == 200: return response.json() else: raise Exception(f"Failed to get schema: {response.text}") -def create_new_collection(client: weaviate.WeaviateClient, old_name: str, schema: Dict[str, Any]) -> str: +def create_new_collection( + client: weaviate.WeaviateClient, old_name: str, schema: Dict[str, Any] +) -> str: """Create a new collection with updated schema using REST API""" import requests - + # Generate new collection name new_name = f"{old_name}_migrated" - + print(f"Creating new collection: {new_name}") - + # Build new schema with proper vectorConfig # Note: When using vectorConfig (named vectors), we don't set class-level vectorizer new_schema = { @@ -81,91 +94,88 @@ def create_new_collection(client: weaviate.WeaviateClient, old_name: str, schema # Do NOT set class-level vectorizer when using vectorConfig "vectorConfig": { "default": { - "vectorizer": { - "none": {} - }, + "vectorizer": {"none": {}}, "vectorIndexType": "hnsw", "vectorIndexConfig": { "distance": "cosine", "ef": -1, "efConstruction": 128, - "maxConnections": 32 - } + "maxConnections": 32, + }, } }, - "properties": [] + "properties": [], } - + # Copy properties from old schema if "properties" in schema: new_schema["properties"] = schema["properties"] - + # Create collection via REST API response = requests.post( - f"http://{WEAVIATE_HOST}:{WEAVIATE_PORT}/v1/schema", + f"{WEAVIATE_ENDPOINT}/v1/schema", json=new_schema, - headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"} + headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"}, ) - + if response.status_code not in [200, 201]: raise Exception(f"Failed to create collection: {response.text}") - + print(f" Created new collection: {new_name}") return new_name def migrate_collection_data( - client: weaviate.WeaviateClient, - 
old_collection_name: str, - new_collection_name: str + client: weaviate.WeaviateClient, old_collection_name: str, new_collection_name: str ) -> int: """Migrate data from old collection to new collection using cursor-based pagination""" - + old_collection = client.collections.get(old_collection_name) new_collection = client.collections.get(new_collection_name) - + total_migrated = 0 cursor = None - + print(f"Migrating data from {old_collection_name} to {new_collection_name}") - + while True: # Fetch batch of objects using cursor-based pagination if cursor is None: # First batch response = old_collection.query.fetch_objects( - limit=BATCH_SIZE, - include_vector=True + limit=BATCH_SIZE, include_vector=True ) else: # Subsequent batches using cursor response = old_collection.query.fetch_objects( - limit=BATCH_SIZE, - include_vector=True, - after=cursor + limit=BATCH_SIZE, include_vector=True, after=cursor ) - + objects = response.objects - + if not objects: break - + # Use batch insert for efficiency with new_collection.batch.dynamic() as batch: for obj in objects: # Prepare properties properties = obj.properties - + # Add object with vector batch.add_object( properties=properties, - vector=obj.vector["default"] if isinstance(obj.vector, dict) else obj.vector, - uuid=obj.uuid + vector=( + obj.vector["default"] + if isinstance(obj.vector, dict) + else obj.vector + ), + uuid=obj.uuid, ) - + total_migrated += len(objects) print(f" Migrated {total_migrated} objects...") - + # Update cursor for next iteration if len(objects) < BATCH_SIZE: # Last batch @@ -173,36 +183,34 @@ def migrate_collection_data( else: # Get the last object's UUID for cursor cursor = objects[-1].uuid - + print(f" Total migrated: {total_migrated} objects") return total_migrated def verify_migration( - client: weaviate.WeaviateClient, - old_collection_name: str, - new_collection_name: str + client: weaviate.WeaviateClient, old_collection_name: str, new_collection_name: str ): """Verify that the migration 
was successful""" - + old_collection = client.collections.get(old_collection_name) new_collection = client.collections.get(new_collection_name) - + # Count objects in both collections old_count_response = old_collection.query.fetch_objects(limit=1) new_count_response = new_collection.query.fetch_objects(limit=1) - + # Get aggregation for accurate counts old_agg = old_collection.aggregate.over_all(total_count=True) new_agg = new_collection.aggregate.over_all(total_count=True) - + old_count = old_agg.total_count new_count = new_agg.total_count - + print(f"\nVerification:") print(f" Old collection ({old_collection_name}): {old_count} objects") print(f" New collection ({new_collection_name}): {new_count} objects") - + if old_count == new_count: print(f" Status: SUCCESS - Counts match!") return True @@ -212,109 +220,132 @@ def verify_migration( def replace_old_collection( - client: weaviate.WeaviateClient, - old_collection_name: str, - new_collection_name: str + client: weaviate.WeaviateClient, old_collection_name: str, new_collection_name: str ): """Replace old collection with migrated one by recreating with original name""" import requests - + print(f"\nReplacing old collection with migrated data...") - - # Step 1: Get data from migrated collection - print(f" Step 1: Getting data from migrated collection...") - migrated = client.collections.get(new_collection_name) - objects = migrated.query.fetch_objects(include_vector=True, limit=10000) - print(f" Found {len(objects.objects)} objects") - - # Step 2: Delete old collection - print(f" Step 2: Deleting old collection...") + + # Step 1: Delete old collection + print(f" Step 1: Deleting old collection...") response = requests.delete( - f"http://{WEAVIATE_HOST}:{WEAVIATE_PORT}/v1/schema/{old_collection_name}", - headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"} + f"{WEAVIATE_ENDPOINT}/v1/schema/{old_collection_name}", + headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"}, ) if response.status_code != 200: 
print(f" Warning: Could not delete old collection: {response.text}") else: print(f" Deleted") - - # Step 3: Get schema from migrated collection - print(f" Step 3: Getting schema from migrated collection...") + + # Step 2: Get schema from migrated collection + print(f" Step 2: Getting schema from migrated collection...") schema_response = requests.get( - f"http://{WEAVIATE_HOST}:{WEAVIATE_PORT}/v1/schema/{new_collection_name}", - headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"} + f"{WEAVIATE_ENDPOINT}/v1/schema/{new_collection_name}", + headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"}, ) schema = schema_response.json() schema["class"] = old_collection_name - - # Step 4: Create collection with original name and new schema - print(f" Step 4: Creating collection with original name...") + + # Step 3: Create collection with original name and new schema + print(f" Step 3: Creating collection with original name...") create_response = requests.post( - f"http://{WEAVIATE_HOST}:{WEAVIATE_PORT}/v1/schema", + f"{WEAVIATE_ENDPOINT}/v1/schema", json=schema, - headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"} + headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"}, ) if create_response.status_code not in [200, 201]: raise Exception(f"Failed to create collection: {create_response.text}") print(f" Created") - - # Step 5: Copy data to collection with original name - print(f" Step 5: Copying data to original collection name...") + + # Step 4: Copy data to collection with original name using cursor-based pagination + print(f" Step 4: Copying data to original collection name...") + migrated_collection = client.collections.get(new_collection_name) new_collection = client.collections.get(old_collection_name) - - with new_collection.batch.dynamic() as batch: - for obj in objects.objects: - batch.add_object( - properties=obj.properties, - vector=obj.vector, - uuid=obj.uuid + + total_copied = 0 + cursor = None + + while True: + # Fetch batch of objects using 
cursor-based pagination + if cursor is None: + # First batch + response = migrated_collection.query.fetch_objects( + include_vector=True, limit=BATCH_SIZE ) - - count = new_collection.aggregate.over_all(total_count=True).total_count - print(f" Copied {count} objects") - - # Step 6: Delete the temporary migrated collection - print(f" Step 6: Cleaning up temporary migrated collection...") + else: + # Subsequent batches using cursor + response = migrated_collection.query.fetch_objects( + include_vector=True, limit=BATCH_SIZE, after=cursor + ) + + objects = response.objects + + if not objects: + break + + # Use batch insert for efficiency + with new_collection.batch.dynamic() as batch: + for obj in objects: + batch.add_object( + properties=obj.properties, vector=obj.vector, uuid=obj.uuid + ) + + total_copied += len(objects) + print(f" Copied {total_copied} objects...") + + # Update cursor for next iteration + if len(objects) < BATCH_SIZE: + break + else: + cursor = objects[-1].uuid + + print(f" Total copied: {total_copied} objects") + + # Step 5: Delete the temporary migrated collection + print(f" Step 5: Cleaning up temporary migrated collection...") response = requests.delete( - f"http://{WEAVIATE_HOST}:{WEAVIATE_PORT}/v1/schema/{new_collection_name}", - headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"} + f"{WEAVIATE_ENDPOINT}/v1/schema/{new_collection_name}", + headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"}, ) if response.status_code == 200: print(f" Cleaned up") - - print(f"\n SUCCESS! {old_collection_name} now has the new schema with {count} objects") + + print( + f"\n SUCCESS! 
{old_collection_name} now has the new schema with {total_copied} objects" + ) return True def migrate_all_collections(): """Main migration function""" - + print("=" * 80) print("Weaviate Collection Migration Script") print("Migrating from Weaviate 1.19.0 schema to 1.27.0+ schema") print("=" * 80) print() - + client = weaviate.connect_to_local( host=WEAVIATE_HOST, port=WEAVIATE_PORT, grpc_port=WEAVIATE_GRPC_PORT, - auth_credentials=weaviate.auth.AuthApiKey(WEAVIATE_API_KEY) + auth_credentials=weaviate.auth.AuthApiKey(WEAVIATE_API_KEY), ) - + try: # Step 1: Identify collections that need migration print("Step 1: Identifying collections that need migration...") collections_to_migrate = identify_old_collections(client) - + if not collections_to_migrate: print("\nNo collections need migration. All collections are up to date!") return - + print(f"\nFound {len(collections_to_migrate)} collections to migrate:") for col in collections_to_migrate: print(f" - {col}") - + # Confirm before proceeding print("\nThis script will:") print("1. Create new collections with updated schema") @@ -322,54 +353,58 @@ def migrate_all_collections(): print("3. Verify the migration") print("4. 
Optionally rename collections to activate the new ones") print() - + # Step 2: Migrate each collection for collection_name in collections_to_migrate: print("\n" + "=" * 80) print(f"Migrating: {collection_name}") print("=" * 80) - + try: # Get old schema schema = get_collection_schema(client, collection_name) - + # Create new collection - new_collection_name = create_new_collection(client, collection_name, schema) - + new_collection_name = create_new_collection( + client, collection_name, schema + ) + # Migrate data - migrated_count = migrate_collection_data(client, collection_name, new_collection_name) - + migrated_count = migrate_collection_data( + client, collection_name, new_collection_name + ) + # Verify migration success = verify_migration(client, collection_name, new_collection_name) - - if success and migrated_count > 0: + + if success: print(f"\nMigration successful for {collection_name}!") print(f"New collection: {new_collection_name}") - + # Automatically replace old collection with migrated one try: - replace_old_collection(client, collection_name, new_collection_name) + replace_old_collection( + client, collection_name, new_collection_name + ) except Exception as e: - print(f"\nWarning: Could not automatically replace collection: {e}") + print( + f"\nWarning: Could not automatically replace collection: {e}" + ) print(f"\nTo activate manually:") print(f"1. Delete the old collection: {collection_name}") print(f"2. Rename {new_collection_name} to {collection_name}") - + except Exception as e: print(f"\nError migrating {collection_name}: {e}") print(f"Skipping this collection and continuing...") continue - + print("\n" + "=" * 80) print("Migration Complete!") print("=" * 80) print("\nSummary:") print(f" Collections migrated: {len(collections_to_migrate)}") - print(f"\nNext steps:") - print(f"1. Test the new collections (*_migrated)") - print(f"2. If everything works, delete or backup the old collections") - print(f"3. 
Rename the new collections to remove '_migrated' suffix") - + finally: client.close() @@ -383,6 +418,6 @@ if __name__ == "__main__": except Exception as e: print(f"\n\nFatal error: {e}") import traceback + traceback.print_exc() sys.exit(1) - diff --git a/en/self-host/troubleshooting/weaviate-v4-migration.mdx b/en/self-host/troubleshooting/weaviate-v4-migration.mdx index 8a2bb2f6..4e576fdd 100644 --- a/en/self-host/troubleshooting/weaviate-v4-migration.mdx +++ b/en/self-host/troubleshooting/weaviate-v4-migration.mdx @@ -6,20 +6,25 @@ title: Weaviate Migration Guide upgrading to Client v4 and Server 1.27+ ## Overview -Starting with **Dify v1.9.2**, the weaviate-client has been upgraded from v3 to v4.17.0. This upgrade brings significant performance improvements and better stability, but requires **Weaviate server version 1.27.0 or higher**. +Starting with an upcoming Dify release, the weaviate-client has been upgraded from v3 to v4.17.0. This upgrade brings significant performance improvements and better stability, but requires **Weaviate server version 1.27.0 or higher**. -**BREAKING CHANGE:** The new weaviate-client v4 is NOT backward compatible with Weaviate server versions below 1.27.0. If you are running a self-hosted Weaviate instance on version 1.19.0 or older, you must upgrade your Weaviate server before upgrading Dify. + **BREAKING CHANGE:** The new weaviate-client v4 is NOT backward compatible + with Weaviate server versions below 1.27.0. If you are running a self-hosted + Weaviate instance on version 1.19.0 or older, you must upgrade your Weaviate + server before upgrading Dify. ### Who Is Affected? 
This migration affects: + - Self-hosted Dify users running their own Weaviate instances on versions below 1.27.0 - Users currently on Weaviate server version 1.19.0-1.26.x - Users upgrading to Dify versions with weaviate-client v4 **Not affected:** + - Cloud-hosted Weaviate users (Weaviate Cloud manages the server version) - Users already on Weaviate 1.27.0+ can upgrade Dify without additional steps - Users running Dify's default Docker Compose setup (Weaviate version is updated automatically) @@ -44,17 +49,21 @@ The weaviate-client v4 introduces several breaking changes: ## Version Compatibility Matrix -| Dify Version | Weaviate-client Version | Compatible Weaviate Server Versions | -|--------------|-------------------------|-------------------------------------| +| Dify Version | weaviate-client Version | Compatible Weaviate Server Versions | +| ------------ | ----------------------- | ----------------------------------- | | ≤ 1.9.1 | v3.x | 1.19.0 - 1.26.x | -| ≥ 1.9.2 | v4.17.0 | 1.27.0+ (tested up to 1.33.1) | +| ≥ 1.9.2\* | v4.17.0 | 1.27.0+ (tested up to 1.33.1) | -This migration applies to any Dify version using weaviate-client v4.17.0 or higher. + *The exact Dify version with weaviate-client v4 may vary. Check the release + notes for your specific version. This migration applies to any Dify version + using weaviate-client v4.17.0 or higher. -Weaviate server version 1.19.0 was released over a year ago and is now outdated. Upgrading to 1.27.0+ provides access to numerous improvements in performance, stability, and features. + Weaviate server version 1.19.0 was released over a year ago and is now + outdated. Upgrading to 1.27.0+ provides access to numerous improvements in + performance, stability, and features. ## Prerequisites @@ -62,17 +71,21 @@ Weaviate server version 1.19.0 was released over a year ago and is now outdated. Before starting the migration, complete these steps: 1. 
**Check Your Current Weaviate Version** + ```bash curl http://localhost:8080/v1/meta ``` + Look for the `version` field in the response. 2. **Backup Your Data** + - Create a complete backup of your Weaviate data - Backup your Docker volumes if using Docker Compose - Document your current configuration settings 3. **Review System Requirements** + - Ensure sufficient disk space for database migration - Verify network connectivity between Dify and Weaviate - Confirm gRPC port (50051) is accessible if using external Weaviate @@ -92,38 +105,40 @@ Choose the migration path that matches your deployment setup and current Weaviat - **Path B – Direct Recovery (already on 1.27+):** Use this if you already upgraded to 1.27+ and your knowledge bases stopped working. This path focuses on repairing the data layout and running the schema migration. -Do **not** attempt to downgrade back to 1.19. The schema format is incompatible and will lead to data loss. + Do **not** attempt to downgrade back to 1.19. The schema format is + incompatible and will lead to data loss. ### Path A: Migration with Backup (From 1.19) -Safest path. Creates a backup before upgrading so you can restore if anything goes wrong. + Safest path. Creates a backup before upgrading so you can restore if anything + goes wrong. 
#### Prerequisites - Currently running Weaviate 1.19 - Docker + Docker Compose installed -- Python 3.11+ available for the [schema migration script](https://github.com/langgenius/dify-docs/blob/main/assets/migrate_weaviate_collections.py) +- Python 3.11+ available for the schema migration script #### Step A1: Enable the Backup Module on Weaviate 1.19 Edit `docker/docker-compose.yaml` so the `weaviate` service includes backup configuration: ```yaml - weaviate: - image: semitechnologies/weaviate:1.19.0 - volumes: - - ./volumes/weaviate:/var/lib/weaviate - - ./volumes/weaviate_backups:/var/lib/weaviate/backups - ports: - - "8080:8080" - - "50051:50051" - environment: - ENABLE_MODULES: backup-filesystem - BACKUP_FILESYSTEM_PATH: /var/lib/weaviate/backups - # ... rest of your environment variables +weaviate: + image: semitechnologies/weaviate:1.19.0 + volumes: + - ./volumes/weaviate:/var/lib/weaviate + - ./volumes/weaviate_backups:/var/lib/weaviate/backups + ports: + - "8080:8080" + - "50051:50051" + environment: + ENABLE_MODULES: backup-filesystem + BACKUP_FILESYSTEM_PATH: /var/lib/weaviate/backups + # ... 
rest of your environment variables ``` Restart Weaviate to apply the change: @@ -206,6 +221,10 @@ sleep 10 #### Step A4: Fix Orphaned LSM Data (if present) +You can fix orphaned LSM data either from the host or inside the container: + +**Option A: From host (if volumes are mounted):** + ```bash cd docker/volumes/weaviate @@ -226,6 +245,32 @@ docker compose restart weaviate sleep 15 ``` +**Option B: Inside Weaviate container (recommended):** + +```bash +cd /path/to/dify/docker +docker compose exec -it weaviate /bin/sh + +# Inside container +cd /var/lib/weaviate +for dir in vector_index_*_node_*_lsm; do + [ -d "$dir" ] || continue + + index_id=$(echo "$dir" | sed -n 's/vector_index_\([^_]*_[^_]*_[^_]*_[^_]*_[^_]*\)_node_.*/\1/p') + shard_id=$(echo "$dir" | sed -n 's/.*_node_\([^_]*\)_lsm/\1/p') + + mkdir -p "vector_index_${index_id}_node/$shard_id/lsm" + cp -a "$dir/"* "vector_index_${index_id}_node/$shard_id/lsm/" + + echo "✓ Copied $dir" +done +exit + +# Restart Weaviate +docker compose restart weaviate +sleep 15 +``` + #### Step A5: Migrate the Schema 1. **Install dependencies** (in a temporary virtualenv is fine): @@ -237,12 +282,39 @@ sleep 15 pip install weaviate-client requests ``` -2. **Run the [migration script](https://github.com/langgenius/dify-docs/blob/main/assets/migrate_weaviate_collections.py):** +2. 
**Run the migration script:** + + You can run the script either locally or inside the Worker container: + + **Option A: Run locally (if you have Python 3.11+ and dependencies installed):** ```bash python3 migrate_weaviate_collections.py ``` + **Option B: Run inside Worker container (recommended for Docker setups):** + + ```bash + # Copy script to storage directory + cp migrate_weaviate_collections.py /path/to/dify/docker/volumes/app/storage/ + + # Enter worker container + cd /path/to/dify/docker + docker compose exec -it worker /bin/bash + + # Run migration script (use --no-cache for Dify 1.11.0+) + uv run --no-cache /app/api/storage/migrate_weaviate_collections.py + + # Exit container + exit + ``` + + + The migration script uses environment variables for configuration, making + it suitable for running inside Docker containers. Its defaults point to the + Docker-internal host `weaviate`, so when running locally (Option A) first set + `WEAVIATE_ENDPOINT` (for example `http://localhost:8080`), + `WEAVIATE_GRPC_ENDPOINT` (for example `localhost:50051`), and + `WEAVIATE_API_KEY` to match your instance. For Dify 1.11.0+, if you + encounter permission errors with `uv`, use `uv run --no-cache` instead. + + 3. **Restart Dify services:** ```bash @@ -253,28 +325,40 @@ sleep 15 4. **Verify in the UI:** open Dify, test retrieval against your migrated knowledge bases. + + For large collections (over 10,000 objects), verify that the object count + matches between old and new collections. The migration script will display + verification counts automatically. + + -After confirming a healthy migration, you can delete `weaviate_migration_env` and the backup files to reclaim disk space. + After confirming a healthy migration, you can delete `weaviate_migration_env` + and the backup files to reclaim disk space. ### Path B: Direct Recovery (Already on 1.27+) -Only use this path if you already upgraded to 1.27+ and your knowledge bases stopped working. You cannot create a 1.19 backup anymore, so you must repair the data in place. + Only use this path if you already upgraded to 1.27+ and your knowledge bases + stopped working. You cannot create a 1.19 backup anymore, so you must repair + the data in place. 
#### Prerequisites - Currently running Weaviate 1.27+ (including 1.33) - Docker + Docker Compose installed -- Python 3.11+ for the [migration script](https://github.com/langgenius/dify-docs/blob/main/assets/migrate_weaviate_collections.py) +- Python 3.11+ for the migration script #### Step B1: Repair Orphaned LSM Data +Stop Weaviate and fix orphaned LSM data: + ```bash -cd docker +cd /path/to/dify/docker docker compose stop weaviate +# Option A: From host (if volumes are mounted) cd volumes/weaviate for dir in vector_index_*_node_*_lsm; do @@ -288,12 +372,24 @@ for dir in vector_index_*_node_*_lsm; do echo "✓ Copied $dir" done + +# Option B: Inside container (recommended) +docker compose run --rm --entrypoint /bin/sh weaviate -c " +cd /var/lib/weaviate +for dir in vector_index_*_node_*_lsm; do + [ -d \"\$dir\" ] || continue + index_id=\$(echo \"\$dir\" | sed -n 's/vector_index_\([^_]*_[^_]*_[^_]*_[^_]*_[^_]*\)_node_.*/\1/p') + shard_id=\$(echo \"\$dir\" | sed -n 's/.*_node_\([^_]*\)_lsm/\1/p') + mkdir -p \"vector_index_\${index_id}_node/\$shard_id/lsm\" + cp -a \"\$dir/\"* \"vector_index_\${index_id}_node/\$shard_id/lsm/\" + echo \"✓ Copied \$dir\" +done +" ``` Restart Weaviate: ```bash -cd ../.. docker compose start weaviate sleep 15 ``` @@ -316,7 +412,30 @@ curl -s -H "Authorization: Bearer " \ #### Step B2: Run the Schema Migration -Follow the same commands as [Step A5](#step-a5:-migrate-the-schema). Create the virtualenv if needed, install `weaviate-client` 4.x, run `migrate_weaviate_collections.py`, then restart `api`, `worker`, and `worker_beat`. +Follow the same commands as [Step A5](#step-a5-migrate-the-schema). 
You can run the script locally or inside the Worker container: + +**To run inside Worker container:** + +```bash +# Copy script to storage directory +cp migrate_weaviate_collections.py /path/to/dify/docker/volumes/app/storage/ + +# Enter worker container +cd /path/to/dify/docker +docker compose exec -it worker /bin/bash + +# Run migration script +uv run --no-cache /app/api/storage/migrate_weaviate_collections.py + +# Exit and restart services +exit +docker compose restart api worker worker_beat +``` + + + The migration script uses cursor-based pagination to safely handle large + collections. Verify object counts match after migration completes. + #### Step B3: Verify in Dify @@ -332,6 +451,7 @@ Follow the same commands as [Step A5](#step-a5:-migrate-the-schema). Create the **Your existing knowledge bases will NOT work after upgrade without migration!** ### Why Migration is Needed: + - Old data: Created with Weaviate v3 client (simple schema) - New code: Requires Weaviate v4 format (extended schema) - **Incompatible**: Old data missing required properties @@ -343,6 +463,7 @@ Follow the same commands as [Step A5](#step-a5:-migrate-the-schema). Create the ##### Option B: Re-index from Original Documents ##### Option C: Keep Old Weaviate (Don't Upgrade Yet) If you can't afford downtime or data loss. + ### Automatic Migration @@ -379,7 +500,9 @@ curl -X POST "http://localhost:8080/v1/backups/filesystem/pre-migration-backup/r ``` -For comprehensive migration guidance, especially for complex schemas or large datasets, refer to the official [Weaviate Migration Guide](https://weaviate.io/developers/weaviate/installation/migration). + For comprehensive migration guidance, especially for complex schemas or large + datasets, refer to the official [Weaviate Migration + Guide](https://weaviate.io/developers/weaviate/installation/migration). 
## Configuration Changes @@ -395,10 +518,12 @@ The following new environment variable is available in Dify versions with weavia **Format:** `hostname:port` (NO protocol prefix) **Default Ports:** + - Insecure: 50051 - Secure (TLS): 443 **Examples:** + ```bash # Docker Compose (internal network) WEAVIATE_GRPC_ENDPOINT=weaviate:50051 @@ -414,7 +539,8 @@ WEAVIATE_GRPC_ENDPOINT=your-instance.weaviate.cloud:443 ``` -Do NOT include protocol prefixes like `grpc://` or `http://` in the WEAVIATE_GRPC_ENDPOINT value. Use only `hostname:port`. + Do NOT include protocol prefixes like `grpc://` or `http://` in the + WEAVIATE_GRPC_ENDPOINT value. Use only `hostname:port`. ### Updated Environment Variables @@ -446,8 +572,6 @@ WEAVIATE_GRPC_ENDPOINT=weaviate:50051 WEAVIATE_BATCH_SIZE=100 ``` - - ## Verification Steps After completing the migration, verify everything is working correctly: @@ -483,7 +607,8 @@ Look for messages indicating successful connection without "No module named 'wea 6. Check that status changes from "QUEUING" → "INDEXING" → "AVAILABLE" -If documents get stuck in "QUEUING" status, check that the Celery worker is running: `docker compose logs worker` + If documents get stuck in "QUEUING" status, check that the Celery worker is + running: `docker compose logs worker` ### 4. Test Vector Search @@ -506,7 +631,8 @@ docker compose logs -f api | grep -i "query_time\|duration" ``` -With gRPC properly configured, vector search queries should be 2-5x faster compared to HTTP-only connections. + With gRPC properly configured, vector search queries should be 2-5x faster + compared to HTTP-only connections. ## Troubleshooting @@ -516,6 +642,7 @@ With gRPC properly configured, vector search queries should be 2-5x faster compa **Cause:** The weaviate-client v4 is not installed, or v3 is still being used. 
**Solution:** + ```bash # For Docker installations, ensure you're running the correct Dify version docker compose pull @@ -537,20 +664,22 @@ pip install weaviate-client==4.17.0 The port is available internally between containers. No action needed unless you're connecting from outside Docker. 2. **For external Weaviate:** + ```bash # Check if Weaviate is listening on 50051 docker ps | grep weaviate # Look for "0.0.0.0:50051->50051/tcp" - + # If not exposed, restart with port mapping docker run -p 8080:8080 -p 50051:50051 ... ``` 3. **Check firewall rules:** + ```bash # Linux sudo ufw allow 50051/tcp - + # Check if port is listening netstat -tlnp | grep 50051 ``` @@ -562,23 +691,25 @@ pip install weaviate-client==4.17.0 **Solution:** 1. Verify API key matches in both Weaviate and Dify: + ```bash # Check Weaviate authentication curl http://localhost:8080/v1/meta | jq '.authentication' - + # Check Dify configuration docker compose exec api env | grep WEAVIATE_API_KEY ``` 2. If using anonymous access: + ```yaml # Weaviate docker-compose.yaml weaviate: environment: - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - AUTHENTICATION_APIKEY_ENABLED: 'false' + AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true" + AUTHENTICATION_APIKEY_ENABLED: "false" ``` - + Then remove `WEAVIATE_API_KEY` from Dify configuration. ### Issue: Documents Stuck in "QUEUING" Status @@ -609,17 +740,20 @@ docker compose restart worker **Solution:** 1. Verify gRPC configuration: + ```bash docker compose exec api env | grep WEAVIATE_GRPC ``` - + Should show: + ``` WEAVIATE_GRPC_ENABLED=true WEAVIATE_GRPC_ENDPOINT=weaviate:50051 ``` 2. Test gRPC connectivity: + ```bash docker exec -it dify-api-1 nc -zv weaviate 50051 # Should return "succeeded" @@ -634,16 +768,19 @@ docker compose restart worker **Solution:** 1. Check Weaviate logs for specific error messages: + ```bash docker compose logs weaviate | tail -100 ``` 2. List current schema: + ```bash curl http://localhost:8080/v1/schema ``` 3. 
If necessary, delete corrupted collections (⚠️ this deletes all data): + ```bash # Backup first! curl -X DELETE http://localhost:8080/v1/schema/YourCollectionName @@ -655,7 +792,8 @@ docker compose restart worker ``` -Deleting collections removes all data. Only do this if you have a backup and are prepared to re-index all content. + Deleting collections removes all data. Only do this if you have a backup and + are prepared to re-index all content. ### Issue: Docker Volume Permission Errors @@ -663,6 +801,7 @@ Deleting collections removes all data. Only do this if you have a backup and are **Cause:** User ID mismatch in Docker containers. **Solution:** + ```bash # Check ownership of Weaviate data directory ls -la docker/volumes/weaviate/ @@ -674,6 +813,21 @@ sudo chown -R 1000:1000 docker/volumes/weaviate/ docker compose restart weaviate ``` +### Issue: Permission Denied When Running Migration Script (Dify 1.11.0+) + +**Cause:** The `/home/dify` directory may not exist in newer Dify versions, causing `uv` cache creation to fail. + +**Solution:** + +```bash +# Option 1: Use --no-cache flag (recommended) +uv run --no-cache migrate_weaviate_collections.py + +# Option 2: Run as root user +docker compose exec -u root worker /bin/bash +uv run migrate_weaviate_collections.py +``` + ## Rollback Plan If the migration fails and you need to rollback: @@ -724,7 +878,8 @@ docker compose logs | grep -i error ``` -Always test the rollback procedure in a staging environment first if possible. Maintain multiple backup copies before attempting major migrations. + Always test the rollback procedure in a staging environment first if possible. + Maintain multiple backup copies before attempting major migrations. ## Additional Resources @@ -734,15 +889,15 @@ Always test the rollback procedure in a staging environment first if possible. 
M - [Weaviate Migration Guide](https://weaviate.io/developers/weaviate/installation/migration) - [Weaviate v4 Client Documentation](https://weaviate.io/developers/weaviate/client-libraries/python) - [Weaviate Backup and Restore](https://weaviate.io/developers/weaviate/configuration/backups) -- [Dify Self-Hosting Guide](/en/self-host/quick-start/docker-compose) -- [Dify Environment Variables](/en/self-host/configuration/environments) +- [Dify Self-Hosting Guide](/en/getting-started/install-self-hosted/docker-compose) +- [Dify Environment Variables](/en/getting-started/install-self-hosted/environments) ### Community Resources - [Dify GitHub Repository](https://github.com/langgenius/dify) - [Dify GitHub Issues - Weaviate](https://github.com/langgenius/dify/issues?q=is%3Aissue+weaviate) - [Weaviate Community Forum](https://forum.weaviate.io/) -- [Dify Community Forum](https://forum.dify.ai/) +- [Dify Discord Community](https://discord.gg/dify) ### Migration Tools @@ -753,16 +908,18 @@ Always test the rollback procedure in a staging environment first if possible. M This migration brings important improvements to Dify's vector storage capabilities: - **Better Performance:** gRPC support dramatically improves query and import speeds (2-5x faster) +**Better Performance:** gRPC support dramatically improves query and import speeds (2-5x faster) - **Improved Stability:** Enhanced connection handling and error recovery +**Improved Stability:** Enhanced connection handling and error recovery - **Security:** Access to security updates and patches not available in Weaviate 1.19.0 +**Security:** Access to security updates and patches not available in Weaviate 1.19.0 - **Future-Proof:** Access to latest Weaviate features and ongoing support +**Future-Proof:** Access to latest Weaviate features and ongoing support While this is a breaking change requiring server upgrade for users on old versions, the benefits significantly outweigh the migration effort. 
Most Docker Compose users can complete the migration in under 15 minutes with the automatic update. -If you encounter any issues not covered in this guide, please report them on the [Dify GitHub Issues page](https://github.com/langgenius/dify/issues) with the label "weaviate" and "migration". - \ No newline at end of file + If you encounter any issues not covered in this guide, please report them on + the [Dify GitHub Issues page](https://github.com/langgenius/dify/issues) with + the labels "weaviate" and "migration". +