To learn about the HNSW (Hierarchical Navigable Small World) index for AI and vector search, here’s a breakdown of the key concepts and steps:
1. What is HNSW (Hierarchical Navigable Small World)?
- HNSW is a data structure used for approximate nearest neighbor (ANN) search in high-dimensional spaces. It is highly efficient for large-scale vector search, especially in AI applications where embedding vectors power recommendations, similarity search, and clustering.
- It is based on the small-world graph model, which optimizes both speed and accuracy in ANN searches.
2. Why HNSW?
- High efficiency in both time and memory, making it suitable for real-time search applications.
- Handles high-dimensional data very well (e.g., word embeddings, image features).
- Scales to millions or even billions of vectors, although its graph links carry a nontrivial memory cost (see Best Practices below).
3. Learning the Basics of Vector Indexing with HNSW
- Vector embeddings: In AI, vectors are used to represent data such as words, sentences, or images. Before learning HNSW, it’s important to understand how models generate these vectors (e.g., embeddings from BERT, GPT, or image models); a short embedding sketch follows this list.
- Indexing: HNSW builds a multi-layer graph to index these high-dimensional vectors. Each node in the graph represents a vector, and the edges connect it to its nearest neighbors.
- Search algorithm: When a query vector is introduced, HNSW navigates the graph to find vectors that are most similar (nearest neighbors) in terms of cosine or Euclidean distance.
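As noted above, the embedding vectors typically come from a pretrained model. Here is a minimal sketch, assuming the sentence-transformers package and the all-MiniLM-L6-v2 model, which are just one common choice among many:

from sentence_transformers import SentenceTransformer  # pip install sentence-transformers

# Encode two sentences into 384-dimensional embedding vectors
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(["HNSW is a graph-based ANN index", "Cats sleep most of the day"])
print(embeddings.shape)  # (2, 384)

With embeddings like these in hand, the simplified, educational implementation below illustrates how HNSW builds its layered graph and answers queries; it favors clarity over the optimizations found in production libraries: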
import numpy as np
from typing import List, Dict, Set
import random
import heapq
class Node:
def __init__(self, data, id):
self.data = data # vector data
self.id = id
self.neighbors: Dict[int, Set[int]] = {} # layer -> set of neighbor ids
self.max_layer = 0
class HNSW:
def __init__(self, dim: int, M: int = 5, ef_construction: int = 100, max_layers: int = 4):
self.dim = dim
self.M = M # max number of connections per layer
self.M0 = 2 * M # max connections for layer 0
self.ef_construction = ef_construction
self.max_layers = max_layers
self.nodes: Dict[int, Node] = {}
self.entry_point = None
    def _generate_random_level(self) -> int:
        """Draw a node's top layer from an exponential distribution
        (multiplier M here; the original HNSW paper recommends 1/ln(M))"""
        return int(-np.log(random.uniform(0, 1)) * self.M)
def _distance(self, a: np.ndarray, b: np.ndarray) -> float:
"""Calculate Euclidean distance between vectors"""
return np.linalg.norm(a - b)
def _select_neighbors(self, q: np.ndarray, candidates: List[int], M: int) -> List[int]:
"""Select M closest neighbors from candidates"""
if len(candidates) <= M:
return candidates
distances = [(c, self._distance(q, self.nodes[c].data)) for c in candidates]
distances.sort(key=lambda x: x[1])
return [x[0] for x in distances[:M]]
    def _search_layer(self, q: np.ndarray, ep: Set[int], ef: int, layer: int) -> List[int]:
        """Greedy best-first search for the ef nearest neighbors of q within one layer"""
        visited = set(ep)  # mark entry points as visited so they are not re-added
        candidates = [(self._distance(q, self.nodes[ep_id].data), ep_id) for ep_id in ep]
        heapq.heapify(candidates)      # min-heap of (distance, id) still to be expanded
        W = sorted(candidates)[:ef]    # working set of current best results, sorted by distance
        while candidates:
            c_dist, c_id = heapq.heappop(candidates)
            # Stop once the closest unexpanded candidate is farther than the worst result
            if c_dist > W[-1][0]:
                break
            # Examine neighbors of the candidate at this layer
            for neighbor_id in self.nodes[c_id].neighbors.get(layer, set()):
                if neighbor_id in visited:
                    continue
                visited.add(neighbor_id)
                d = self._distance(q, self.nodes[neighbor_id].data)
                if len(W) < ef or d < W[-1][0]:
                    # Add to candidates for later expansion
                    heapq.heappush(candidates, (d, neighbor_id))
                    # Add to working set, keeping only the ef closest
                    W.append((d, neighbor_id))
                    W.sort()
                    if len(W) > ef:
                        W = W[:ef]
        return [x[1] for x in W]
def insert(self, vector: np.ndarray) -> int:
"""Insert a new vector into the index"""
vector_id = len(self.nodes)
node = Node(vector, vector_id)
# Generate random level
level = min(self._generate_random_level(), self.max_layers - 1)
node.max_layer = level
# Initialize node's neighbors for each layer
for l in range(level + 1):
node.neighbors[l] = set()
self.nodes[vector_id] = node
# If this is the first node
if self.entry_point is None:
self.entry_point = vector_id
return vector_id
# Find entry point
curr = self.entry_point
curr_dist = self._distance(vector, self.nodes[curr].data)
        # Greedy descent through the layers above the new node's level
        # to find a good entry point for the lower layers
        for l in range(self.nodes[self.entry_point].max_layer, level, -1):
            changed = True
            while changed:
                changed = False
                for neighbor_id in self.nodes[curr].neighbors.get(l, set()):
                    neighbor_dist = self._distance(vector, self.nodes[neighbor_id].data)
                    if neighbor_dist < curr_dist:
                        curr, curr_dist = neighbor_id, neighbor_dist
                        changed = True
        # Search and connect on each layer from min(level, top layer) down to 0
        for l in range(min(self.nodes[self.entry_point].max_layer, level), -1, -1):
changed = True
while changed:
changed = False
# Check neighbors at current layer
neighbors = self.nodes[curr].neighbors.get(l, set())
for neighbor_id in neighbors:
neighbor_dist = self._distance(vector, self.nodes[neighbor_id].data)
if neighbor_dist < curr_dist:
curr = neighbor_id
curr_dist = neighbor_dist
changed = True
# Select and connect neighbors for current layer
candidates = self._search_layer(vector, {curr}, self.ef_construction, l)
neighbors = self._select_neighbors(vector, candidates, self.M if l > 0 else self.M0)
# Add bidirectional connections
for neighbor_id in neighbors:
self.nodes[vector_id].neighbors[l].add(neighbor_id)
self.nodes[neighbor_id].neighbors[l].add(vector_id)
# Update entry point if necessary
if level > self.nodes[self.entry_point].max_layer:
self.entry_point = vector_id
return vector_id
def search(self, query: np.ndarray, k: int = 1) -> List[int]:
"""Search k nearest neighbors for query vector"""
        if self.entry_point is None:  # node id 0 is falsy, so test against None explicitly
return []
curr = self.entry_point
curr_dist = self._distance(query, self.nodes[curr].data)
# Search from top to bottom
for l in range(self.nodes[self.entry_point].max_layer, 0, -1):
changed = True
while changed:
changed = False
neighbors = self.nodes[curr].neighbors.get(l, set())
for neighbor_id in neighbors:
neighbor_dist = self._distance(query, self.nodes[neighbor_id].data)
if neighbor_dist < curr_dist:
curr = neighbor_id
curr_dist = neighbor_dist
changed = True
# Search the bottom layer more thoroughly
candidates = self._search_layer(query, {curr}, self.ef_construction, 0)
# Sort candidates by distance and return k nearest
results = [(c, self._distance(query, self.nodes[c].data)) for c in candidates]
results.sort(key=lambda x: x[1])
return [x[0] for x in results[:k]]
# Example usage
# Create sample data
dim = 3
num_points = 1000
data = np.random.rand(num_points, dim)
# Initialize HNSW index
index = HNSW(dim=dim, M=5, ef_construction=100, max_layers=4)
# Insert vectors
for i, vector in enumerate(data):
index.insert(vector)
# Perform search
query = np.random.rand(dim)
nearest_neighbors = index.search(query, k=5)
print(f"Query vector: {query}")
print(f"Nearest neighbors: {nearest_neighbors}")
Let’s analyze how HNSW works with concrete examples:
Layer Structure Example: Consider a dataset with 1000 points in 3D space; the upper layers thin out geometrically, so they might hold roughly:
Layer 3 (top): ~8 points
Layer 2: ~32 points
Layer 1: ~128 points
Layer 0 (bottom): all 1000 points
Insertion Process Example: Let’s insert vector v = [0.5, 0.3, 0.7]:
a) Generate random level (l = 2)
b) Start at entry point: ep = [0.4, 0.4, 0.4]
c) Traverse down layers:
Layer 2: ep → [0.45, 0.35, 0.65]
Layer 1: [0.45, 0.35, 0.65] → [0.48, 0.32, 0.68]
Layer 0: [0.48, 0.32, 0.68] → [0.49, 0.31, 0.69] → v
Search Process Example: Query q = [0.52, 0.29, 0.71]:
Step 1 (Layer 2):
- Start at entry point
- Find closest neighbor: [0.45, 0.35, 0.65]
Step 2 (Layer 1):
- Explore neighbors of [0.45, 0.35, 0.65]
- Find closer point: [0.48, 0.32, 0.68]
Step 3 (Layer 0):
- Detailed exploration around [0.48, 0.32, 0.68]
- Find exact nearest neighbors
Key Parameters and Their Effects:
M = 5 # Max connections per layer (except layer 0)
M0 = 2*M = 10 # Max connections at layer 0
ef_construction = 100 # Size of dynamic candidate list
Distance Calculations Example: For vectors a = [0.5, 0.3, 0.7] and b = [0.6, 0.2, 0.8]:
distance = sqrt((0.6-0.5)² + (0.2-0.3)² + (0.8-0.7)²)
= sqrt(0.01 + 0.01 + 0.01)
= sqrt(0.03)
≈ 0.173
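The same value can be checked directly with numpy, which is exactly what the _distance method above computes:

import numpy as np

a = np.array([0.5, 0.3, 0.7])
b = np.array([0.6, 0.2, 0.8])
print(np.linalg.norm(a - b))  # 0.1732..., i.e. sqrt(0.03)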
Layer Probability Distribution: With the level rule used in the implementation above (level = int(-ln(U) · M)), the probability of a node reaching level l is:
P(level ≥ l) = exp(-l/M)
For M=5:
P(level ≥ 1) ≈ 0.819
P(level ≥ 2) ≈ 0.670
P(level ≥ 3) ≈ 0.549
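Because the implementation above draws levels as int(-ln(U) · M), these probabilities can be confirmed empirically with a quick simulation:

import random
import numpy as np

M = 5
levels = [int(-np.log(random.uniform(0, 1)) * M) for _ in range(100_000)]
for l in (1, 2, 3):
    frac = sum(level >= l for level in levels) / len(levels)
    print(f"P(level >= {l}) ≈ {frac:.3f}")  # roughly 0.819, 0.670, 0.549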
The hierarchical structure provides:
- Logarithmic search complexity: O(log N)
- Memory that grows linearly with the dataset: roughly O(N·M) graph links plus the vectors themselves
- High recall rate (typically 0.95–0.99)
- Practical performance advantages over flat index structures
4. Steps to Learn and Implement HNSW
Understand Approximate Nearest Neighbor (ANN) Search:
- Learn about ANN search and how it’s different from exact nearest neighbor search.
- Learn about similarity metrics like Euclidean distance and cosine similarity.
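For intuition, here is a minimal numpy comparison of the two metrics, reusing the example vectors from the distance calculation earlier:

import numpy as np

a = np.array([0.5, 0.3, 0.7])
b = np.array([0.6, 0.2, 0.8])

euclidean = np.linalg.norm(a - b)                                # smaller means more similar
cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))  # closer to 1 means more similar
print(euclidean, cosine)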
Libraries for HNSW:
- Python has libraries like FAISS (from Facebook) and nmslib that provide efficient implementations of HNSW for vector search.
- Example installation of FAISS (CPU build; use faiss-gpu on CUDA systems):
pip install faiss-cpu
- Example installation of nmslib:
pip install nmslib
Implementing HNSW:
- Here’s a simple example of how to build and query an HNSW index using FAISS:
import faiss
import numpy as np
# Number of dimensions (embedding size)
d = 128
# Randomly generate some vectors (for demonstration)
vectors = np.random.random((10000, d)).astype('float32')
# Create HNSW index
index = faiss.IndexHNSWFlat(d, 32)  # the second argument is M, the number of links per node
index.hnsw.efConstruction = 40 # Controls tradeoff between accuracy and index build speed
# Add vectors to index
index.add(vectors)
# Query the index with a new vector
query_vector = np.random.random((1, d)).astype('float32')
D, I = index.search(query_vector, k=5) # Find 5 nearest neighbors
print(f"Nearest neighbors: {I}")
print(f"Distances: {D}")
Key parameters:
- M: the number of links created per node on each layer (the second argument to IndexHNSWFlat above).
- efConstruction: Higher values increase accuracy at the cost of slower index building.
- efSearch: Higher values improve search accuracy during queries at the cost of speed.
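efSearch is not set explicitly in the snippet above; it can be adjusted on the index at query time. A minimal sketch, reusing the index and query_vector from the FAISS example (the specific values are arbitrary):

index.hnsw.efSearch = 16    # faster queries, lower recall
D_fast, I_fast = index.search(query_vector, 5)

index.hnsw.efSearch = 128   # slower queries, higher recall
D_accurate, I_accurate = index.search(query_vector, 5)

For comparison, roughly the same index can be built with nmslib. The sketch below assumes nmslib's HNSW method and its standard parameter names; treat the concrete values as illustrative:

import nmslib
import numpy as np

vectors = np.random.random((10000, 128)).astype('float32')
nms_index = nmslib.init(method='hnsw', space='l2')
nms_index.addDataPointBatch(vectors)
nms_index.createIndex({'M': 32, 'efConstruction': 40})
nms_index.setQueryTimeParams({'efSearch': 50})
ids, distances = nms_index.knnQuery(np.random.random(128).astype('float32'), k=5)
print(ids, distances)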
5. Best Practices and Optimizations
- Tuning M, efConstruction, and efSearch for a specific dataset can significantly improve performance.
- Dimensionality reduction: Applying techniques like PCA to reduce the dimensionality of vectors before indexing can sometimes speed up the process without a major loss in accuracy (see the sketch after this list).
- Memory considerations: HNSW uses more memory than some other ANN algorithms, so memory optimization might be important for very large datasets.
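Here is a minimal sketch of the PCA-then-index idea mentioned above, assuming scikit-learn is available; the 32-dimension target is an arbitrary illustrative choice:

import faiss
import numpy as np
from sklearn.decomposition import PCA

vectors = np.random.random((10000, 128)).astype('float32')

# Reduce 128-dimensional vectors to 32 dimensions before indexing
pca = PCA(n_components=32)
reduced = pca.fit_transform(vectors).astype('float32')

index = faiss.IndexHNSWFlat(32, 16)
index.add(reduced)

# Queries must be projected with the same fitted PCA transform
query = np.random.random((1, 128)).astype('float32')
D, I = index.search(pca.transform(query).astype('float32'), 5)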
6. Use Cases of HNSW in AI
- Recommendation Systems: Similar item or user searches based on vector representations of user preferences or item features.
- Image Search: Search for similar images by comparing the embedding vectors generated by a CNN.
- Text Search: Search for semantically similar documents or sentences using text embeddings (e.g., from BERT).
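Tying the text-search use case together, here is a minimal end-to-end sketch; the documents, the all-MiniLM-L6-v2 model, and the parameter values are illustrative assumptions rather than a reference implementation:

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

docs = [
    "How to reset a forgotten password",
    "Best hiking trails near the city",
    "Recovering access to a locked account",
]
model = SentenceTransformer("all-MiniLM-L6-v2")
doc_vecs = np.asarray(model.encode(docs), dtype='float32')

# Build an HNSW index over the document embeddings
index = faiss.IndexHNSWFlat(doc_vecs.shape[1], 32)
index.add(doc_vecs)

# Embed the query with the same model and search for the 2 nearest documents
query_vec = np.asarray(model.encode(["I forgot my password"]), dtype='float32')
D, I = index.search(query_vec, 2)
print([docs[i] for i in I[0]])  # expected to surface the two password/account documents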