Update the weaviate v4 migration docs and script (#616)

* updated docs and migration script

* update the dify version, add the migration script link, and improve formatting

---------

Co-authored-by: DhruvGorasiya <Dhruv.Gorasiya@student.csulb.edu>
Co-authored-by: kurokobo <2920259+kurokobo@users.noreply.github.com>
Co-authored-by: Riskey <riskey47@dify.ai>
This commit is contained in:
Riskey
2025-12-16 13:52:21 +08:00
committed by GitHub
parent f258cdffcb
commit 212e616a02
2 changed files with 396 additions and 232 deletions

View File

@@ -1,15 +1,22 @@
#!/usr/bin/env python3
"""
Migration script to fix Weaviate schema incompatibility between 1.19.0 and 1.27.0+
This script:
- Identifies collections with old schema (no vectorConfig)
- Creates new collections with proper vectorConfig including "default" named vector
- Migrates data using cursor-based pagination (efficient for large datasets)
- Uses batch operations for fast inserts
- Preserves all object properties and vectors
Note:
- This is a community-edited version of the draft of the script presented by the Dify Team.
- This script is not officially supported by the Dify Team.
- The original source for this script can be found at https://github.com/langgenius/dify/issues/27291#issuecomment-3501003678.
- The changes made in this script are:
- Retrieve Weaviate connection info from environment variables to make this script run in the Worker container.
- Switch to cursor-based pagination in "replace_old_collection", since the migration could fail with large collections.
- Fix an issue where both the old and new collections remained without being deleted after migrating an empty collection.
"""
import os
import weaviate
from weaviate.classes.config import Configure, VectorDistances
import sys
@@ -17,62 +24,68 @@ import time
from typing import List, Dict, Any
# Configuration
from urllib.parse import urlsplit


def _parse_endpoint(endpoint: str, default_port: int) -> tuple:
    """Split an endpoint string into ``(host, port)``.

    Accepts both URL-style values (``http://weaviate:8080``,
    ``grpc://weaviate:50051``) and bare ``host:port`` values
    (``weaviate:50051``). A trailing path or slash is ignored.

    Args:
        endpoint: Endpoint string to parse.
        default_port: Port returned when ``endpoint`` does not specify one.

    Returns:
        A ``(host, port)`` tuple; ``host`` falls back to ``"localhost"``
        when the endpoint carries no host name.

    Raises:
        ValueError: If the port component is present but not a valid port.
    """
    # urlsplit only recognises the network location after "//", so add it
    # for bare "host:port" values.
    target = endpoint if "//" in endpoint else f"//{endpoint}"
    parts = urlsplit(target)
    return parts.hostname or "localhost", parts.port or default_port


# Connection settings come from the environment, with fallback defaults.
WEAVIATE_ENDPOINT = os.getenv("WEAVIATE_ENDPOINT", "http://weaviate:8080")
WEAVIATE_GRPC_ENDPOINT = os.getenv("WEAVIATE_GRPC_ENDPOINT", "grpc://weaviate:50051")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY", "WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih")

# Number of objects fetched and inserted per page during migration.
BATCH_SIZE = 1000

# Host and ports are derived from the endpoints above.
WEAVIATE_HOST, WEAVIATE_PORT = _parse_endpoint(WEAVIATE_ENDPOINT, 8080)
WEAVIATE_GRPC_PORT = _parse_endpoint(WEAVIATE_GRPC_ENDPOINT, 50051)[1]
def identify_old_collections(client: weaviate.WeaviateClient) -> List[str]:
    """Return the names of collections that still use the old schema.

    Only collections whose name starts with ``Vector_index_`` are inspected.
    A collection is reported as needing migration when its configuration
    has no ``vector_config``. Progress is printed for every inspected
    collection.
    """
    all_collections = client.collections.list_all()
    print(f"Found {len(all_collections)} total collections")

    collections_to_migrate = []
    for collection_name in all_collections:
        # Anything without the Dify knowledge-base prefix is ignored.
        if collection_name.startswith("Vector_index_"):
            config = client.collections.get(collection_name).config.get()
            if config.vector_config is not None:
                print(f" - {collection_name}: NEW SCHEMA (skip)")
            else:
                collections_to_migrate.append(collection_name)
                print(f" - {collection_name}: OLD SCHEMA (needs migration)")

    return collections_to_migrate
def get_collection_schema(
    client: weaviate.WeaviateClient, collection_name: str
) -> Dict[str, Any]:
    """Fetch the full schema of a collection through the Weaviate REST API.

    Args:
        client: Connected Weaviate client. Not used by this function.
        collection_name: Name of the collection (class) to look up.

    Returns:
        The decoded JSON body of ``GET /v1/schema/{collection_name}``.

    Raises:
        Exception: If the server answers with anything other than HTTP 200.
    """
    import requests

    # Use the configured endpoint as-is (scheme included) instead of
    # rebuilding an "http://host:port" URL, so https endpoints work too.
    response = requests.get(
        f"{WEAVIATE_ENDPOINT}/v1/schema/{collection_name}",
        headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"},
        # Without a timeout, requests would wait indefinitely on a hung server.
        timeout=30,
    )
    if response.status_code != 200:
        raise Exception(f"Failed to get schema: {response.text}")
    return response.json()
def create_new_collection(client: weaviate.WeaviateClient, old_name: str, schema: Dict[str, Any]) -> str:
def create_new_collection(
client: weaviate.WeaviateClient, old_name: str, schema: Dict[str, Any]
) -> str:
"""Create a new collection with updated schema using REST API"""
import requests
# Generate new collection name
new_name = f"{old_name}_migrated"
print(f"Creating new collection: {new_name}")
# Build new schema with proper vectorConfig
# Note: When using vectorConfig (named vectors), we don't set class-level vectorizer
new_schema = {
@@ -81,91 +94,88 @@ def create_new_collection(client: weaviate.WeaviateClient, old_name: str, schema
# Do NOT set class-level vectorizer when using vectorConfig
"vectorConfig": {
"default": {
"vectorizer": {
"none": {}
},
"vectorizer": {"none": {}},
"vectorIndexType": "hnsw",
"vectorIndexConfig": {
"distance": "cosine",
"ef": -1,
"efConstruction": 128,
"maxConnections": 32
}
"maxConnections": 32,
},
}
},
"properties": []
"properties": [],
}
# Copy properties from old schema
if "properties" in schema:
new_schema["properties"] = schema["properties"]
# Create collection via REST API
response = requests.post(
f"http://{WEAVIATE_HOST}:{WEAVIATE_PORT}/v1/schema",
f"{WEAVIATE_ENDPOINT}/v1/schema",
json=new_schema,
headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"}
headers={"Authorization": f"Bearer {WEAVIATE_API_KEY}"},
)
if response.status_code not in [200, 201]:
raise Exception(f"Failed to create collection: {response.text}")
print(f" Created new collection: {new_name}")
return new_name
def migrate_collection_data(
client: weaviate.WeaviateClient,
old_collection_name: str,
new_collection_name: str
client: weaviate.WeaviateClient, old_collection_name: str, new_collection_name: str
) -> int:
"""Migrate data from old collection to new collection using cursor-based pagination"""
old_collection = client.collections.get(old_collection_name)
new_collection = client.collections.get(new_collection_name)
total_migrated = 0
cursor = None
print(f"Migrating data from {old_collection_name} to {new_collection_name}")
while True:
# Fetch batch of objects using cursor-based pagination
if cursor is None:
# First batch
response = old_collection.query.fetch_objects(
limit=BATCH_SIZE,
include_vector=True
limit=BATCH_SIZE, include_vector=True
)
else:
# Subsequent batches using cursor
response = old_collection.query.fetch_objects(
limit=BATCH_SIZE,
include_vector=True,
after=cursor
limit=BATCH_SIZE, include_vector=True, after=cursor
)
objects = response.objects
if not objects:
break
# Use batch insert for efficiency
with new_collection.batch.dynamic() as batch:
for obj in objects:
# Prepare properties
properties = obj.properties
# Add object with vector
batch.add_object(
properties=properties,
vector=obj.vector["default"] if isinstance(obj.vector, dict) else obj.vector,
uuid=obj.uuid
vector=(
obj.vector["default"]
if isinstance(obj.vector, dict)
else obj.vector
),
uuid=obj.uuid,
)
total_migrated += len(objects)
print(f" Migrated {total_migrated} objects...")
# Update cursor for next iteration
if len(objects) < BATCH_SIZE:
# Last batch
@@ -173,36 +183,34 @@ def migrate_collection_data(
else:
# Get the last object's UUID for cursor
cursor = objects[-1].uuid
print(f" Total migrated: {total_migrated} objects")
return total_migrated
def verify_migration(
client: weaviate.WeaviateClient,
old_collection_name: str,
new_collection_name: str
client: weaviate.WeaviateClient, old_collection_name: str, new_collection_name: str
):
"""Verify that the migration was successful"""
old_collection = client.collections.get(old_collection_name)
new_collection = client.collections.get(new_collection_name)
# Count objects in both collections
old_count_response = old_collection.query.fetch_objects(limit=1)
new_count_response = new_collection.query.fetch_objects(limit=1)
# Get aggregation for accurate counts
old_agg = old_collection.aggregate.over_all(total_count=True)
new_agg = new_collection.aggregate.over_all(total_count=True)
old_count = old_agg.total_count
new_count = new_agg.total_count
print(f"\nVerification:")
print(f" Old collection ({old_collection_name}): {old_count} objects")
print(f" New collection ({new_collection_name}): {new_count} objects")
if old_count == new_count:
print(f" Status: SUCCESS - Counts match!")
return True
@@ -212,109 +220,132 @@ def verify_migration(
def replace_old_collection(
    client: weaviate.WeaviateClient, old_collection_name: str, new_collection_name: str
):
    """Replace the old collection with the migrated one under the original name.

    The schema of the migrated collection is read first, so nothing is
    deleted if that request fails. The old collection is then deleted and
    recreated under its original name with the migrated schema. Objects are
    copied across with cursor-based pagination. The temporary migrated
    collection is removed only once the object counts of both collections
    match.

    Args:
        client: Connected Weaviate client used for the object copy.
        old_collection_name: Original collection name to be replaced.
        new_collection_name: Temporary collection holding the migrated data.

    Returns:
        True when the replacement completed.

    Raises:
        Exception: If the migrated schema cannot be read, the collection
            cannot be recreated, or the copied object count does not match
            the migrated collection. In the last case the migrated
            collection is left in place.
    """
    import requests

    headers = {"Authorization": f"Bearer {WEAVIATE_API_KEY}"}
    schema_url = f"{WEAVIATE_ENDPOINT}/v1/schema"

    print("\nReplacing old collection with migrated data...")

    # Step 1: Read the migrated schema before doing anything destructive.
    print(" Step 1: Getting schema from migrated collection...")
    schema_response = requests.get(
        f"{schema_url}/{new_collection_name}", headers=headers
    )
    if schema_response.status_code != 200:
        raise Exception(f"Failed to get schema: {schema_response.text}")
    schema = schema_response.json()
    schema["class"] = old_collection_name

    # Step 2: Delete old collection
    print(" Step 2: Deleting old collection...")
    response = requests.delete(
        f"{schema_url}/{old_collection_name}", headers=headers
    )
    if response.status_code != 200:
        print(f" Warning: Could not delete old collection: {response.text}")
    else:
        print(" Deleted")

    # Step 3: Create collection with original name and new schema
    print(" Step 3: Creating collection with original name...")
    create_response = requests.post(schema_url, json=schema, headers=headers)
    if create_response.status_code not in [200, 201]:
        raise Exception(f"Failed to create collection: {create_response.text}")
    print(" Created")

    # Step 4: Copy data to collection with original name using cursor-based pagination
    print(" Step 4: Copying data to original collection name...")
    migrated_collection = client.collections.get(new_collection_name)
    new_collection = client.collections.get(old_collection_name)

    total_copied = 0
    cursor = None
    while True:
        # The first page is fetched without a cursor; later pages resume
        # after the last UUID seen.
        page_kwargs = {} if cursor is None else {"after": cursor}
        response = migrated_collection.query.fetch_objects(
            include_vector=True, limit=BATCH_SIZE, **page_kwargs
        )
        objects = response.objects
        if not objects:
            break

        # Use batch insert for efficiency
        with new_collection.batch.dynamic() as batch:
            for obj in objects:
                batch.add_object(
                    properties=obj.properties, vector=obj.vector, uuid=obj.uuid
                )

        total_copied += len(objects)
        print(f" Copied {total_copied} objects...")

        # A short page means the source is exhausted.
        if len(objects) < BATCH_SIZE:
            break
        cursor = objects[-1].uuid

    print(f" Total copied: {total_copied} objects")

    # Compare stored counts before removing the only other copy of the data.
    source_count = migrated_collection.aggregate.over_all(
        total_count=True
    ).total_count
    copied_count = new_collection.aggregate.over_all(total_count=True).total_count
    if copied_count != source_count:
        raise Exception(
            f"Copy verification failed: {new_collection_name} has {source_count} "
            f"objects but {old_collection_name} has {copied_count}; "
            f"keeping {new_collection_name}"
        )

    # Step 5: Delete the temporary migrated collection
    print(" Step 5: Cleaning up temporary migrated collection...")
    response = requests.delete(
        f"{schema_url}/{new_collection_name}", headers=headers
    )
    if response.status_code == 200:
        print(" Cleaned up")
    else:
        print(
            f" Warning: Could not delete temporary collection "
            f"{new_collection_name}: {response.text}"
        )

    print(
        f"\n SUCCESS! {old_collection_name} now has the new schema with {total_copied} objects"
    )
    return True
def migrate_all_collections():
"""Main migration function"""
print("=" * 80)
print("Weaviate Collection Migration Script")
print("Migrating from Weaviate 1.19.0 schema to 1.27.0+ schema")
print("=" * 80)
print()
client = weaviate.connect_to_local(
host=WEAVIATE_HOST,
port=WEAVIATE_PORT,
grpc_port=WEAVIATE_GRPC_PORT,
auth_credentials=weaviate.auth.AuthApiKey(WEAVIATE_API_KEY)
auth_credentials=weaviate.auth.AuthApiKey(WEAVIATE_API_KEY),
)
try:
# Step 1: Identify collections that need migration
print("Step 1: Identifying collections that need migration...")
collections_to_migrate = identify_old_collections(client)
if not collections_to_migrate:
print("\nNo collections need migration. All collections are up to date!")
return
print(f"\nFound {len(collections_to_migrate)} collections to migrate:")
for col in collections_to_migrate:
print(f" - {col}")
# Confirm before proceeding
print("\nThis script will:")
print("1. Create new collections with updated schema")
@@ -322,54 +353,58 @@ def migrate_all_collections():
print("3. Verify the migration")
print("4. Optionally rename collections to activate the new ones")
print()
# Step 2: Migrate each collection
for collection_name in collections_to_migrate:
print("\n" + "=" * 80)
print(f"Migrating: {collection_name}")
print("=" * 80)
try:
# Get old schema
schema = get_collection_schema(client, collection_name)
# Create new collection
new_collection_name = create_new_collection(client, collection_name, schema)
new_collection_name = create_new_collection(
client, collection_name, schema
)
# Migrate data
migrated_count = migrate_collection_data(client, collection_name, new_collection_name)
migrated_count = migrate_collection_data(
client, collection_name, new_collection_name
)
# Verify migration
success = verify_migration(client, collection_name, new_collection_name)
if success and migrated_count > 0:
if success:
print(f"\nMigration successful for {collection_name}!")
print(f"New collection: {new_collection_name}")
# Automatically replace old collection with migrated one
try:
replace_old_collection(client, collection_name, new_collection_name)
replace_old_collection(
client, collection_name, new_collection_name
)
except Exception as e:
print(f"\nWarning: Could not automatically replace collection: {e}")
print(
f"\nWarning: Could not automatically replace collection: {e}"
)
print(f"\nTo activate manually:")
print(f"1. Delete the old collection: {collection_name}")
print(f"2. Rename {new_collection_name} to {collection_name}")
except Exception as e:
print(f"\nError migrating {collection_name}: {e}")
print(f"Skipping this collection and continuing...")
continue
print("\n" + "=" * 80)
print("Migration Complete!")
print("=" * 80)
print("\nSummary:")
print(f" Collections migrated: {len(collections_to_migrate)}")
print(f"\nNext steps:")
print(f"1. Test the new collections (*_migrated)")
print(f"2. If everything works, delete or backup the old collections")
print(f"3. Rename the new collections to remove '_migrated' suffix")
finally:
client.close()
@@ -383,6 +418,6 @@ if __name__ == "__main__":
except Exception as e:
print(f"\n\nFatal error: {e}")
import traceback
traceback.print_exc()
sys.exit(1)

View File

@@ -6,20 +6,22 @@ title: Weaviate Migration Guide upgrading to Client v4 and Server 1.27+
## Overview
Starting with **Dify v1.9.2**, the weaviate-client has been upgraded from v3 to v4.17.0. This upgrade brings significant performance improvements and better stability, but requires **Weaviate server version 1.27.0 or higher**.
Starting with **Dify v1.9.2**, the weaviate-client has been upgraded from v3 to v4.17.0. This upgrade brings significant performance improvements and better stability, but requires **Weaviate server version 1.27.0 or higher**.
<Warning>
**BREAKING CHANGE:** The new weaviate-client v4 is NOT backward compatible with Weaviate server versions below 1.27.0. If you are running a self-hosted Weaviate instance on version 1.19.0 or older, you must upgrade your Weaviate server before upgrading Dify.
**BREAKING CHANGE**: The new weaviate-client v4 is NOT backward compatible with Weaviate server versions below 1.27.0. If you are running a self-hosted Weaviate instance on version 1.19.0 or older, you must upgrade your Weaviate server before upgrading Dify.
</Warning>
### Who Is Affected?
This migration affects:
- Self-hosted Dify users running their own Weaviate instances on versions below 1.27.0
- Users currently on Weaviate server version 1.19.0-1.26.x
- Users upgrading to Dify versions with weaviate-client v4
**Not affected:**
**Not affected**:
- Cloud-hosted Weaviate users (Weaviate Cloud manages the server version)
- Users already on Weaviate 1.27.0+ can upgrade Dify without additional steps
- Users running Dify's default Docker Compose setup (Weaviate version is updated automatically)
@@ -30,31 +32,31 @@ This migration affects:
The weaviate-client v4 introduces several breaking changes:
1. **Minimum Server Version:** Requires Weaviate server 1.27.0 or higher
2. **API Changes:** New import structure (`weaviate.classes` instead of `weaviate.client`)
3. **gRPC Support:** Uses gRPC by default on port 50051 for improved performance
4. **Authentication Changes:** Updated authentication methods and configuration
1. **Minimum Server Version**: Requires Weaviate server 1.27.0 or higher
2. **API Changes**: New import structure (`weaviate.classes` instead of `weaviate.client`)
3. **gRPC Support**: Uses gRPC by default on port 50051 for improved performance
4. **Authentication Changes**: Updated authentication methods and configuration
### Why Upgrade?
- **Performance:** Significantly faster query and import operations via gRPC (50051)
- **Stability:** Better connection handling and error recovery
- **Future Compatibility:** Access to latest Weaviate features and ongoing support
- **Security:** Weaviate 1.19.0 is over a year old and no longer receives security updates
- **Performance**: Significantly faster query and import operations via gRPC (50051)
- **Stability**: Better connection handling and error recovery
- **Future Compatibility**: Access to latest Weaviate features and ongoing support
- **Security**: Weaviate 1.19.0 is over a year old and no longer receives security updates
## Version Compatibility Matrix
| Dify Version | Weaviate-client Version | Compatible Weaviate Server Versions |
|--------------|-------------------------|-------------------------------------|
| ------------ | ----------------------- | ----------------------------------- |
| ≤ 1.9.1 | v3.x | 1.19.0 - 1.26.x |
| ≥ 1.9.2 | v4.17.0 | 1.27.0+ (tested up to 1.33.1) |
| ≥ 1.9.2 | v4.17.0 | 1.27.0+ (tested up to 1.33.1) |
<Info>
This migration applies to any Dify version using weaviate-client v4.17.0 or higher.
This migration applies to any Dify version using weaviate-client v4.17.0 or higher.
</Info>
<Info>
Weaviate server version 1.19.0 was released over a year ago and is now outdated. Upgrading to 1.27.0+ provides access to numerous improvements in performance, stability, and features.
Weaviate server version 1.19.0 was released over a year ago and is now outdated. Upgrading to 1.27.0+ provides access to numerous improvements in performance, stability, and features.
</Info>
## Prerequisites
@@ -62,17 +64,21 @@ Weaviate server version 1.19.0 was released over a year ago and is now outdated.
Before starting the migration, complete these steps:
1. **Check Your Current Weaviate Version**
```bash
curl http://localhost:8080/v1/meta
```
Look for the `version` field in the response.
2. **Backup Your Data**
- Create a complete backup of your Weaviate data
- Backup your Docker volumes if using Docker Compose
- Document your current configuration settings
3. **Review System Requirements**
- Ensure sufficient disk space for database migration
- Verify network connectivity between Dify and Weaviate
- Confirm gRPC port (50051) is accessible if using external Weaviate
@@ -88,17 +94,17 @@ Choose the migration path that matches your deployment setup and current Weaviat
### Choose Your Path
- **Path A Migration with Backup (from 1.19):** Recommended if you are still on Weaviate 1.19. You will create a backup, upgrade to 1.27+, repair any orphaned data, and then migrate the schema.
- **Path B Direct Recovery (already on 1.27+):** Use this if you already upgraded to 1.27+ and your knowledge bases stopped working. This path focuses on repairing the data layout and running the schema migration.
- **Path A: Migration with Backup (from 1.19)**: Recommended if you are still on Weaviate 1.19. You will create a backup, upgrade to 1.27+, repair any orphaned data, and then migrate the schema.
- **Path B: Direct Recovery (already on 1.27+)**: Use this if you already upgraded to 1.27+ and your knowledge bases stopped working. This path focuses on repairing the data layout and running the schema migration.
<Warning>
Do **not** attempt to downgrade back to 1.19. The schema format is incompatible and will lead to data loss.
Do **not** attempt to downgrade back to 1.19. The schema format is incompatible and will lead to data loss.
</Warning>
### Path A: Migration with Backup (From 1.19)
<Info>
Safest path. Creates a backup before upgrading so you can restore if anything goes wrong.
Safest path. Creates a backup before upgrading so you can restore if anything goes wrong.
</Info>
#### Prerequisites
@@ -112,18 +118,18 @@ Safest path. Creates a backup before upgrading so you can restore if anything go
Edit `docker/docker-compose.yaml` so the `weaviate` service includes backup configuration:
```yaml
weaviate:
image: semitechnologies/weaviate:1.19.0
volumes:
- ./volumes/weaviate:/var/lib/weaviate
- ./volumes/weaviate_backups:/var/lib/weaviate/backups
ports:
- "8080:8080"
- "50051:50051"
environment:
ENABLE_MODULES: backup-filesystem
BACKUP_FILESYSTEM_PATH: /var/lib/weaviate/backups
# ... rest of your environment variables
weaviate:
image: semitechnologies/weaviate:1.19.0
volumes:
- ./volumes/weaviate:/var/lib/weaviate
- ./volumes/weaviate_backups:/var/lib/weaviate/backups
ports:
- "8080:8080"
- "50051:50051"
environment:
ENABLE_MODULES: backup-filesystem
BACKUP_FILESYSTEM_PATH: /var/lib/weaviate/backups
# ... rest of your environment variables
```
Restart Weaviate to apply the change:
@@ -137,7 +143,7 @@ sleep 10
#### Step A2: Create a Backup
1. **List your collections:**
1. **List your collections**:
```bash
curl -s -H "Authorization: Bearer <WEAVIATE_API_KEY>" \
@@ -151,7 +157,7 @@ sleep 10
"
```
2. **Trigger the backup:** include specific collection names if you prefer.
2. **Trigger the backup**: include specific collection names if you prefer.
```bash
curl -X POST \
@@ -164,7 +170,7 @@ sleep 10
}'
```
3. **Check backup status:**
3. **Check backup status**:
```bash
sleep 5
@@ -173,7 +179,7 @@ sleep 10
python3 -m json.tool | grep status
```
4. **Verify backup files exist:**
4. **Verify backup files exist**:
```bash
ls -lh docker/volumes/weaviate_backups/kb-backup/
@@ -181,7 +187,7 @@ sleep 10
#### Step A3: Upgrade to Weaviate 1.27+
1. **Upgrade Dify to a version that ships Weaviate 1.27+:**
1. **Upgrade Dify to a version that ships Weaviate 1.27+**:
```bash
cd /path/to/dify
@@ -189,13 +195,13 @@ sleep 10
git checkout main # or a tagged release that includes the upgrade
```
2. **Confirm the new Weaviate image:**
2. **Confirm the new Weaviate image**:
```bash
grep "image: semitechnologies/weaviate" docker/docker-compose.yaml
```
3. **Restart with the new version:**
3. **Restart with the new version**:
```bash
cd docker
@@ -206,6 +212,10 @@ sleep 10
#### Step A4: Fix Orphaned LSM Data (if present)
You can fix orphaned LSM data either from the host or inside the container:
**Option A: From host (if volumes are mounted)**:
```bash
cd docker/volumes/weaviate
@@ -226,6 +236,32 @@ docker compose restart weaviate
sleep 15
```
**Option B: Inside Weaviate container (recommended)**:
```bash
cd /path/to/dify/docker
docker compose exec -it weaviate /bin/sh
# Inside container
cd /var/lib/weaviate
for dir in vector_index_*_node_*_lsm; do
[ -d "$dir" ] || continue
index_id=$(echo "$dir" | sed -n 's/vector_index_\([^_]*_[^_]*_[^_]*_[^_]*_[^_]*\)_node_.*/\1/p')
shard_id=$(echo "$dir" | sed -n 's/.*_node_\([^_]*\)_lsm/\1/p')
mkdir -p "vector_index_${index_id}_node/$shard_id/lsm"
cp -a "$dir/"* "vector_index_${index_id}_node/$shard_id/lsm/"
echo "✓ Copied $dir"
done
exit
# Restart Weaviate
docker compose restart weaviate
sleep 15
```
#### Step A5: Migrate the Schema
1. **Install dependencies** (in a temporary virtualenv is fine):
@@ -237,13 +273,35 @@ sleep 15
pip install weaviate-client requests
```
2. **Run the [migration script](https://github.com/langgenius/dify-docs/blob/main/assets/migrate_weaviate_collections.py):**
2. **Run the [migration script](https://github.com/langgenius/dify-docs/blob/main/assets/migrate_weaviate_collections.py)** either locally or inside the Worker container.\
**Option A: Run locally (if you have Python 3.11+ and dependencies installed)**:
```bash
python3 migrate_weaviate_collections.py
```
3. **Restart Dify services:**
**Option B: Run inside Worker container (recommended for Docker setups)**:
```bash
# Copy script to storage directory
cp migrate_weaviate_collections.py /path/to/dify/docker/volumes/app/storage/
# Enter worker container
cd /path/to/dify/docker
docker compose exec -it worker /bin/bash
# Run migration script (use --no-cache for Dify 1.11.0+)
uv run --no-cache /app/api/storage/migrate_weaviate_collections.py
# Exit container
exit
```
<Info>
The migration script uses environment variables for configuration, making it suitable for running inside Docker containers. For Dify 1.11.0+, if you encounter permission errors with `uv`, use `uv run --no-cache` instead.
</Info>
3. **Restart Dify services**:
```bash
cd docker
@@ -251,16 +309,20 @@ sleep 15
sleep 15
```
4. **Verify in the UI:** open Dify, test retrieval against your migrated knowledge bases.
4. **Verify in the UI**: open Dify, test retrieval against your migrated knowledge bases.
<Warning>
For large collections (over 10,000 objects), verify that the object count matches between old and new collections. The migration script will display verification counts automatically.
</Warning>
<Info>
After confirming a healthy migration, you can delete `weaviate_migration_env` and the backup files to reclaim disk space.
After confirming a healthy migration, you can delete `weaviate_migration_env` and the backup files to reclaim disk space.
</Info>
### Path B: Direct Recovery (Already on 1.27+)
<Warning>
Only use this path if you already upgraded to 1.27+ and your knowledge bases stopped working. You cannot create a 1.19 backup anymore, so you must repair the data in place.
Only use this path if you already upgraded to 1.27+ and your knowledge bases stopped working. You cannot create a 1.19 backup anymore, so you must repair the data in place.
</Warning>
#### Prerequisites
@@ -271,10 +333,13 @@ Only use this path if you already upgraded to 1.27+ and your knowledge bases sto
#### Step B1: Repair Orphaned LSM Data
Stop Weaviate and fix orphaned LSM data:
```bash
cd docker
cd /path/to/dify/docker
docker compose stop weaviate
# Option A: From host (if volumes are mounted)
cd volumes/weaviate
for dir in vector_index_*_node_*_lsm; do
@@ -288,12 +353,24 @@ for dir in vector_index_*_node_*_lsm; do
echo "✓ Copied $dir"
done
# Option B: Inside container (recommended)
docker compose run --rm --entrypoint /bin/sh weaviate -c "
cd /var/lib/weaviate
for dir in vector_index_*_node_*_lsm; do
[ -d \"\$dir\" ] || continue
index_id=\$(echo \"\$dir\" | sed -n 's/vector_index_\([^_]*_[^_]*_[^_]*_[^_]*_[^_]*\)_node_.*/\1/p')
shard_id=\$(echo \"\$dir\" | sed -n 's/.*_node_\([^_]*\)_lsm/\1/p')
mkdir -p \"vector_index_\${index_id}_node/\$shard_id/lsm\"
cp -a \"\$dir/\"* \"vector_index_\${index_id}_node/\$shard_id/lsm/\"
echo \"✓ Copied \$dir\"
done
"
```
Restart Weaviate:
```bash
cd ../..
docker compose start weaviate
sleep 15
```
@@ -316,7 +393,30 @@ curl -s -H "Authorization: Bearer <WEAVIATE_API_KEY>" \
#### Step B2: Run the Schema Migration
Follow the same commands as [Step A5](#step-a5:-migrate-the-schema). Create the virtualenv if needed, install `weaviate-client` 4.x, run `migrate_weaviate_collections.py`, then restart `api`, `worker`, and `worker_beat`.
Follow the same commands as [Step A5](#step-a5%3A-migrate-the-schema). You can run the script locally or inside the Worker container:
**To run inside Worker container**:
```bash
# Copy script to storage directory
cp migrate_weaviate_collections.py /path/to/dify/docker/volumes/app/storage/
# Enter worker container
cd /path/to/dify/docker
docker compose exec -it worker /bin/bash
# Run migration script
uv run --no-cache /app/api/storage/migrate_weaviate_collections.py
# Exit and restart services
exit
docker compose restart api worker worker_beat
```
<Info>
The migration script uses cursor-based pagination to safely handle large
collections. Verify object counts match after migration completes.
</Info>
#### Step B3: Verify in Dify
@@ -327,22 +427,24 @@ Follow the same commands as [Step A5](#step-a5:-migrate-the-schema). Create the
## Data Migration for Legacy Versions
<Warning>
### CRITICAL: Data Migration Required
**CRITICAL: Data Migration Required**
**Your existing knowledge bases will NOT work after upgrade without migration!**
### Why Migration is Needed:
**Why Migration is Needed**:
- Old data: Created with Weaviate v3 client (simple schema)
- New code: Requires Weaviate v4 format (extended schema)
- **Incompatible**: Old data is missing required properties
### Migration Options:
**Migration Options**:
##### Option A: Use Weaviate Backup/Restore
- Option A: Use Weaviate Backup/Restore
##### Option B: Re-index from Original Documents
- Option B: Re-index from Original Documents
- Option C: Keep Old Weaviate (Don't Upgrade Yet) if you can't afford downtime or data loss.
##### Option C: Keep Old Weaviate (Don't Upgrade Yet) If you can't afford downtime or data loss.
</Warning>
### Automatic Migration
@@ -379,7 +481,7 @@ curl -X POST "http://localhost:8080/v1/backups/filesystem/pre-migration-backup/r
```
<Info>
For comprehensive migration guidance, especially for complex schemas or large datasets, refer to the official [Weaviate Migration Guide](https://weaviate.io/developers/weaviate/installation/migration).
For comprehensive migration guidance, especially for complex schemas or large datasets, refer to the official [Weaviate Migration Guide](https://weaviate.io/developers/weaviate/installation/migration).
</Info>
## Configuration Changes
@@ -390,15 +492,17 @@ The following new environment variable is available in Dify versions with weavia
#### WEAVIATE_GRPC_ENDPOINT
**Description:** Specifies the gRPC endpoint for Weaviate connections. Using gRPC significantly improves performance for batch operations and queries.
**Description**: Specifies the gRPC endpoint for Weaviate connections. Using gRPC significantly improves performance for batch operations and queries.
**Format:** `hostname:port` (NO protocol prefix)
**Format**: `hostname:port` (NO protocol prefix)
**Default Ports**:
**Default Ports:**
- Insecure: 50051
- Secure (TLS): 443
**Examples:**
**Examples**:
```bash
# Docker Compose (internal network)
WEAVIATE_GRPC_ENDPOINT=weaviate:50051
@@ -414,17 +518,17 @@ WEAVIATE_GRPC_ENDPOINT=your-instance.weaviate.cloud:443
```
<Warning>
Do NOT include protocol prefixes like `grpc://` or `http://` in the WEAVIATE_GRPC_ENDPOINT value. Use only `hostname:port`.
Do NOT include protocol prefixes like `grpc://` or `http://` in the WEAVIATE_GRPC_ENDPOINT value. Use only `hostname:port`.
</Warning>
### Updated Environment Variables
All existing Weaviate environment variables remain the same:
- **WEAVIATE_ENDPOINT:** HTTP endpoint for Weaviate (e.g., `http://weaviate:8080`)
- **WEAVIATE_API_KEY:** API key for authentication (if enabled)
- **WEAVIATE_BATCH_SIZE:** Batch size for imports (default: 100)
- **WEAVIATE_GRPC_ENABLED:** Enable/disable gRPC (default: true in v4)
- **WEAVIATE_ENDPOINT**: HTTP endpoint for Weaviate (e.g., `http://weaviate:8080`)
- **WEAVIATE_API_KEY**: API key for authentication (if enabled)
- **WEAVIATE_BATCH_SIZE**: Batch size for imports (default: 100)
- **WEAVIATE_GRPC_ENABLED**: Enable/disable gRPC (default: true in v4)
### Complete Configuration Example
@@ -446,8 +550,6 @@ WEAVIATE_GRPC_ENDPOINT=weaviate:50051
WEAVIATE_BATCH_SIZE=100
```
## Verification Steps
After completing the migration, verify everything is working correctly:
@@ -483,7 +585,7 @@ Look for messages indicating successful connection without "No module named 'wea
6. Check that status changes from "QUEUING" → "INDEXING" → "AVAILABLE"
<Info>
If documents get stuck in "QUEUING" status, check that the Celery worker is running: `docker compose logs worker`
If documents get stuck in "QUEUING" status, check that the Celery worker is running: `docker compose logs worker`.
</Info>
### 4. Test Vector Search
@@ -506,16 +608,17 @@ docker compose logs -f api | grep -i "query_time\|duration"
```
<Info>
With gRPC properly configured, vector search queries should be 2-5x faster compared to HTTP-only connections.
With gRPC properly configured, vector search queries should be 2-5x faster compared to HTTP-only connections.
</Info>
## Troubleshooting
### Issue: "No module named 'weaviate.classes'"
**Cause:** The weaviate-client v4 is not installed, or v3 is still being used.
**Cause**: The weaviate-client v4 is not installed, or v3 is still being used.
**Solution**:
**Solution:**
```bash
# For Docker installations, ensure you're running the correct Dify version
docker compose pull
@@ -529,63 +632,67 @@ pip install weaviate-client==4.17.0
### Issue: Connection Refused on gRPC Port (50051)
**Cause:** Port 50051 is not exposed, not accessible, or Weaviate is not listening on it.
**Cause**: Port 50051 is not exposed, not accessible, or Weaviate is not listening on it.
**Solution:**
**Solution**:
1. **For Docker Compose users with bundled Weaviate:**
1. **For Docker Compose users with bundled Weaviate**:
The port is available internally between containers. No action needed unless you're connecting from outside Docker.
2. **For external Weaviate:**
2. **For external Weaviate**:
```bash
# Check if Weaviate is listening on 50051
docker ps | grep weaviate
# Look for "0.0.0.0:50051->50051/tcp"
# If not exposed, restart with port mapping
docker run -p 8080:8080 -p 50051:50051 ...
```
3. **Check firewall rules:**
3. **Check firewall rules**:
```bash
# Linux
sudo ufw allow 50051/tcp
# Check if port is listening
netstat -tlnp | grep 50051
```
### Issue: Authentication Errors (401 Unauthorized)
**Cause:** API key mismatch or authentication configuration issue.
**Cause**: API key mismatch or authentication configuration issue.
**Solution:**
**Solution**:
1. Verify API key matches in both Weaviate and Dify:
```bash
# Check Weaviate authentication
curl http://localhost:8080/v1/meta | jq '.authentication'
# Check Dify configuration
docker compose exec api env | grep WEAVIATE_API_KEY
```
2. If using anonymous access:
```yaml
# Weaviate docker-compose.yaml
weaviate:
environment:
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
AUTHENTICATION_APIKEY_ENABLED: 'false'
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: "true"
AUTHENTICATION_APIKEY_ENABLED: "false"
```
Then remove `WEAVIATE_API_KEY` from Dify configuration.
### Issue: Documents Stuck in "QUEUING" Status
**Cause:** Celery worker not running or not connected to Redis.
**Cause**: Celery worker not running or not connected to Redis.
**Solution:**
**Solution**:
```bash
# Check if worker is running
@@ -604,22 +711,25 @@ docker compose restart worker
### Issue: Slow Performance After Migration
**Cause:** gRPC not enabled or configured incorrectly.
**Cause**: gRPC not enabled or configured incorrectly.
**Solution:**
**Solution**:
1. Verify gRPC configuration:
```bash
docker compose exec api env | grep WEAVIATE_GRPC
```
Should show:
```
WEAVIATE_GRPC_ENABLED=true
WEAVIATE_GRPC_ENDPOINT=weaviate:50051
```
2. Test gRPC connectivity:
```bash
docker exec -it dify-api-1 nc -zv weaviate 50051
# Should return "succeeded"
@@ -629,21 +739,24 @@ docker compose restart worker
### Issue: Schema Migration Errors
**Cause:** Incompatible schema changes between Weaviate versions or corrupted data.
**Cause**: Incompatible schema changes between Weaviate versions or corrupted data.
**Solution:**
**Solution**:
1. Check Weaviate logs for specific error messages:
```bash
docker compose logs weaviate | tail -100
```
2. List current schema:
```bash
curl http://localhost:8080/v1/schema
```
3. If necessary, delete corrupted collections (⚠️ this deletes all data):
```bash
# Backup first!
curl -X DELETE http://localhost:8080/v1/schema/YourCollectionName
@@ -655,14 +768,15 @@ docker compose restart worker
```
<Warning>
Deleting collections removes all data. Only do this if you have a backup and are prepared to re-index all content.
Deleting collections removes all data. Only do this if you have a backup and are prepared to re-index all content.
</Warning>
### Issue: Docker Volume Permission Errors
**Cause:** User ID mismatch in Docker containers.
**Cause**: User ID mismatch in Docker containers.
**Solution**:
**Solution:**
```bash
# Check ownership of Weaviate data directory
ls -la docker/volumes/weaviate/
@@ -674,6 +788,21 @@ sudo chown -R 1000:1000 docker/volumes/weaviate/
docker compose restart weaviate
```
### Issue: Permission Denied When Running Migration Script (Dify 1.11.0+)
**Cause**: The `/home/dify` directory may not exist in newer Dify versions, causing `uv` cache creation to fail.
**Solution**:
```bash
# Option 1: Use --no-cache flag (recommended)
uv run --no-cache migrate_weaviate_collections.py
# Option 2: Run as root user
docker compose exec -u root worker /bin/bash
uv run migrate_weaviate_collections.py
```
## Rollback Plan
If the migration fails and you need to roll back:
@@ -724,7 +853,7 @@ docker compose logs | grep -i error
```
<Info>
Always test the rollback procedure in a staging environment first if possible. Maintain multiple backup copies before attempting major migrations.
Always test the rollback procedure in a staging environment first if possible. Maintain multiple backup copies before attempting major migrations.
</Info>
## Additional Resources
@@ -753,16 +882,16 @@ Always test the rollback procedure in a staging environment first if possible. M
This migration brings important improvements to Dify's vector storage capabilities:
**Better Performance:** gRPC support dramatically improves query and import speeds (2-5x faster)
- **Better Performance**: gRPC support dramatically improves query and import speeds (2-5x faster)
**Improved Stability:** Enhanced connection handling and error recovery
- **Improved Stability**: Enhanced connection handling and error recovery
**Security:** Access to security updates and patches not available in Weaviate 1.19.0
- **Security**: Access to security updates and patches not available in Weaviate 1.19.0
**Future-Proof:** Access to latest Weaviate features and ongoing support
- **Future-Proof**: Access to latest Weaviate features and ongoing support
While this is a breaking change requiring a server upgrade for users on old versions, the benefits significantly outweigh the migration effort. Most Docker Compose users can complete the migration in under 15 minutes with the automatic update.
<Info>
If you encounter any issues not covered in this guide, please report them on the [Dify GitHub Issues page](https://github.com/langgenius/dify/issues) with the labels "weaviate" and "migration".
</Info>
If you encounter any issues not covered in this guide, please report them on the [Dify GitHub Issues page](https://github.com/langgenius/dify/issues) with the labels "weaviate" and "migration".
</Info>