Spaces:
Sleeping
Sleeping
File size: 6,764 Bytes
30b652f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
"""
FlexChunk - Minimal implementation of optimized data structure for sparse matrix chunks.
Ref: T4, T5, T13
"""
import numpy as np
import os
import struct
from typing import Tuple, Optional
import scipy.sparse as sparse
# Magic number for binary format identification
FLEX_CHUNK_MAGIC = b'FLXCHK01'
class FlexChunk:
"""
A flexible sparse matrix chunk representation optimized for efficient operations.
Ref: T4, T10
"""
def __init__(self,
start_row: int,
num_rows: int,
row_offsets: np.ndarray,
col_indices: np.ndarray,
data: np.ndarray,
shape: Optional[Tuple[int, int]] = None):
"""
Initialize a FlexChunk from raw CSR data
Ref: T4
Args:
start_row: Global starting row index
num_rows: Number of rows in this chunk
row_offsets: CSR row pointer array (length num_rows+1)
col_indices: CSR column indices array
data: CSR data values array
shape: Optional matrix shape (rows, cols). If not provided, will be inferred.
"""
self.start_row = start_row
self.num_rows = num_rows
self.end_row = start_row + num_rows
# Validate row_offsets
if len(row_offsets) != num_rows + 1:
raise ValueError(f"row_offsets must have length {num_rows + 1}, got {len(row_offsets)}")
if not np.all(np.diff(row_offsets) >= 0):
raise ValueError("row_offsets must be monotonically increasing")
# [T4] Preserve structural representation
self.row_offsets = row_offsets
self.col_indices = col_indices
self.data = data
# Determine number of columns
if shape is not None:
self.n_cols = shape[1]
elif len(col_indices) > 0:
# If shape not provided, determine by max column index
self.n_cols = col_indices.max() + 1
else:
self.n_cols = 0
# Save full matrix shape
self.shape = (num_rows, self.n_cols)
# Stats
self.nnz = len(data)
def process_with_vector(self, vector: np.ndarray) -> np.ndarray:
"""
Multiply chunk with a vector
Ref: T5, T13
Args:
vector: Vector to multiply with
Returns:
Result of multiplication
"""
if len(vector) != self.n_cols:
raise ValueError(f"Vector length {len(vector)} does not match matrix columns {self.n_cols}")
# [T5] Skip processing for empty data
if self.nnz == 0:
return np.zeros(self.num_rows, dtype=vector.dtype)
# Create result buffer
result = np.zeros(self.num_rows, dtype=vector.dtype)
# [T13] Optimize computation flow
for i in range(self.num_rows):
start_idx = self.row_offsets[i]
end_idx = self.row_offsets[i+1]
# [T5] Process only non-zero elements
for j in range(start_idx, end_idx):
col = self.col_indices[j]
if col < len(vector):
result[i] += self.data[j] * vector[col]
return result
@classmethod
def from_csr_matrix(cls,
matrix: sparse.csr_matrix,
start_row: int = 0,
end_row: Optional[int] = None) -> 'FlexChunk':
"""
Create a FlexChunk from a CSR matrix (full or slice)
Ref: T4, T9
Args:
matrix: A scipy.sparse.csr_matrix
start_row: Global start row index
end_row: Global end row index (optional)
Returns:
A new FlexChunk
"""
if not sparse.isspmatrix_csr(matrix):
matrix = matrix.tocsr()
if end_row is None:
end_row = start_row + matrix.shape[0]
num_rows = end_row - start_row
if num_rows != matrix.shape[0]:
raise ValueError(f"Matrix shape {matrix.shape} doesn't match row range {start_row}:{end_row}")
# [T4] Maintain data structure integrity
row_offsets = matrix.indptr.copy()
col_indices = matrix.indices.copy()
data = matrix.data.copy()
return cls(
start_row=start_row,
num_rows=num_rows,
row_offsets=row_offsets,
col_indices=col_indices,
data=data,
shape=matrix.shape
)
def save_chunk(chunk: FlexChunk, filepath: str) -> None:
"""
Save a FlexChunk to a binary file.
Ref: T4
Args:
chunk: The FlexChunk to save
filepath: Path to save the file
"""
with open(filepath, 'wb') as f:
# Write the magic number
f.write(FLEX_CHUNK_MAGIC)
# [T4] Store structural representation
f.write(struct.pack('q', chunk.start_row))
f.write(struct.pack('q', chunk.num_rows))
f.write(struct.pack('q', chunk.nnz))
f.write(struct.pack('q', chunk.n_cols))
# Write arrays
f.write(chunk.row_offsets.astype(np.int32).tobytes())
f.write(chunk.col_indices.astype(np.int32).tobytes())
f.write(chunk.data.astype(np.float64).tobytes())
def load_chunk(filepath: str) -> FlexChunk:
"""
Load a FlexChunk from a binary file.
Ref: T4
Args:
filepath: Path to the file
Returns:
Loaded FlexChunk
"""
with open(filepath, 'rb') as f:
# Verify the magic number
magic = f.read(len(FLEX_CHUNK_MAGIC))
if magic != FLEX_CHUNK_MAGIC:
raise ValueError(f"Invalid file format for {filepath}")
# [T4] Restore structural representation
start_row = struct.unpack('q', f.read(8))[0]
num_rows = struct.unpack('q', f.read(8))[0]
nnz = struct.unpack('q', f.read(8))[0]
n_cols = struct.unpack('q', f.read(8))[0]
# Read arrays
row_offsets = np.frombuffer(f.read((num_rows + 1) * 4), dtype=np.int32)
col_indices = np.frombuffer(f.read(nnz * 4), dtype=np.int32)
data = np.frombuffer(f.read(nnz * 8), dtype=np.float64)
# Create the FlexChunk with explicit shape
chunk = FlexChunk(
start_row=start_row,
num_rows=num_rows,
row_offsets=row_offsets,
col_indices=col_indices,
data=data,
shape=(num_rows, n_cols)
)
return chunk |