| name | gpu-memory |
| description | GPU memory management for CuPy. Use when optimizing CPU/GPU data transfers, avoiding out-of-memory errors, implementing memory-efficient pipelines, or managing GPU memory pools. |
# GPU Memory Management

## Checking Memory Status

```python
import cupy as cp

# Memory pool status
mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes() / 1024**3:.2f} GB")
print(f"Total: {mempool.total_bytes() / 1024**3:.2f} GB")

# Device memory info (free and total bytes on the current GPU)
free_mem, total_mem = cp.cuda.Device().mem_info
print(f"Free: {free_mem / 1024**3:.2f} GB")
```
## Memory Pool and Transfer

```python
# Memory pool management
mempool = cp.get_default_memory_pool()
mempool.free_all_blocks()            # return cached blocks to the driver
mempool.set_limit(size=4 * 1024**3)  # cap pool growth at 4 GB

# BAD: many small transfers, each paying per-transfer overhead
for i in range(1000):
    x_gpu = cp.asarray(data[i])

# GOOD: one large transfer
data_gpu = cp.asarray(data)

# Async transfer on a non-default stream
stream = cp.cuda.Stream()
with stream:
    data_gpu = cp.asarray(data)
stream.synchronize()

# Overlap transfer and compute: copy the next batch on one stream
# while the previous batch (already on the GPU) is processed on another
with cp.cuda.Stream():
    batch1_gpu = cp.asarray(batch1)
with cp.cuda.Stream():
    result = process(batch0_gpu)
```
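Note that host-to-device copies only overlap with compute when the source buffer is pinned; `cp.asarray` from a regular pageable NumPy array blocks regardless of the stream. A minimal sketch of staging through pinned memory (the `pinned_array` helper is illustrative, not a CuPy API):

```python
import numpy as np
import cupy as cp

def pinned_array(shape, dtype=np.float32):
    """Allocate a NumPy array backed by pinned (page-locked) host memory."""
    size = int(np.prod(shape))
    mem = cp.cuda.alloc_pinned_memory(size * np.dtype(dtype).itemsize)
    return np.frombuffer(mem, dtype=dtype, count=size).reshape(shape)

src = pinned_array((1024, 1024))
src[...] = 1.0                         # fill on the host
dst = cp.empty(src.shape, dtype=src.dtype)
stream = cp.cuda.Stream()
dst.set(src, stream=stream)            # asynchronous H2D copy on the stream
stream.synchronize()
```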
## Avoiding Out-of-Memory

```python
import numpy as np

# Check free device memory before a large allocation
def safe_allocate(shape, dtype=cp.float64):
    needed = int(np.prod(shape)) * cp.dtype(dtype).itemsize
    free, _ = cp.cuda.Device().mem_info
    if needed > free * 0.9:  # keep a 10% safety margin
        raise MemoryError(f"Need {needed/1e9:.2f} GB, only {free/1e9:.2f} GB free")
    return cp.empty(shape, dtype=dtype)
```
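For example, a caller might catch the error and fall back to the chunked path below (sizes are illustrative):

```python
try:
    scratch = safe_allocate((40000, 40000))  # ~12.8 GB of float64
except MemoryError:
    scratch = None  # fall back to chunked processing, shown next
```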
```python
# Process in chunks that fit in GPU memory
def chunked_process(data, chunk_size=10000):
    results = []
    for i in range(0, len(data), chunk_size):
        chunk = cp.asarray(data[i:i + chunk_size])
        results.append(process(chunk).get())  # copy result back to host
        del chunk
        cp.get_default_memory_pool().free_all_blocks()
    return results
```
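Assuming `process` returns arrays with a common trailing shape, the host-side pieces can be stitched back together afterwards:

```python
import numpy as np

out = np.concatenate(chunked_process(data))
```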
## Memory-Efficient Patterns

```python
# Use in-place operations to avoid temporary arrays
result = cp.empty_like(a)
cp.add(a, b, out=result)

# Reuse a preallocated buffer across iterations
buffer = cp.empty((1000, 1000))
for batch in batches:
    cp.multiply(batch, 2, out=buffer)

# Smaller dtypes: float32 needs half the memory of float64.
# Generate directly as float32; calling .astype(cp.float32) on a float64
# array would briefly hold both in memory.
data_f32 = cp.random.rand(10000, 10000, dtype=cp.float32)
```
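Kernel fusion also cuts temporaries: `cp.fuse` compiles an elementwise expression into a single kernel, so intermediates such as `x * x` never materialize in the memory pool. A minimal sketch:

```python
import cupy as cp

@cp.fuse()
def fused_norm(x, y):
    # Executes as one kernel; no intermediate arrays are allocated
    return cp.sqrt(x * x + y * y)

a = cp.random.rand(1_000_000, dtype=cp.float32)
b = cp.random.rand(1_000_000, dtype=cp.float32)
r = fused_norm(a, b)
```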
```python
# Multi-GPU distribution: split the data evenly across devices
def multi_gpu_process(data, n_gpus=2):
    results = []
    n = len(data)
    for gpu_id in range(n_gpus):
        with cp.cuda.Device(gpu_id):
            chunk = data[gpu_id * n // n_gpus : (gpu_id + 1) * n // n_gpus]
            results.append(process(cp.asarray(chunk)).get())
    return results
```
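As written, each iteration blocks on `.get()` before moving to the next device. A variant that launches work on every device first and gathers afterwards (a sketch, assuming `process` only launches GPU kernels, which return control to the host immediately):

```python
import cupy as cp

def multi_gpu_process_async(data, n_gpus=None):
    if n_gpus is None:
        n_gpus = cp.cuda.runtime.getDeviceCount()
    n = len(data)
    partials = []
    # Phase 1: launch on every device; kernel launches are asynchronous,
    # so the loop proceeds while earlier devices are still computing
    for gpu_id in range(n_gpus):
        with cp.cuda.Device(gpu_id):
            chunk = data[gpu_id * n // n_gpus : (gpu_id + 1) * n // n_gpus]
            partials.append(process(cp.asarray(chunk)))
    # Phase 2: gather; each .get() synchronizes its device
    results = []
    for gpu_id, part in enumerate(partials):
        with cp.cuda.Device(gpu_id):
            results.append(part.get())
    return results
```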