I have an autodiff tensor library in Rust where I implement Tensor as:
use std::cell::{RefCell, RefMut};
use std::collections::HashSet;
use std::hash::{Hash, Hasher};
use std::ops::{Deref, DerefMut, Index, IndexMut, AddAssign, Add};
use std::rc::Rc;
use uuid::Uuid;
use num_traits::Zero;

pub struct TensorData {
    pub data: NdArray<f64, 2>,
    pub grad: NdArray<f64, 2>,
    pub uuid: Uuid,
    backward: Option<fn(&TensorData)>, // backprop function for the op that produced this node
    prev: Vec<Tensor>,                 // parent tensors in the computation graph
    op: Option<String>,                // op name, for debugging
}

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd)]
pub struct NdArray<T: Clone, const N: usize> {
    pub shape: [usize; N],
    pub data: Vec<T>, // row-major storage
}
impl<T: Clone, const N: usize> NdArray<T, N> {
    pub fn new(array: &[T], shape: [usize; N]) -> Self {
        NdArray {
            shape,
            data: array.to_vec(),
        }
    }

    pub fn zeros(shape: [usize; N]) -> Self
    where
        T: Clone + Zero,
    {
        NdArray {
            shape,
            data: vec![T::zero(); shape.iter().product()],
        }
    }

    pub fn fill(&mut self, val: T) {
        self.data = vec![val; self.shape.iter().product()]
    }

    /// Converts an N-dimensional index into a flat offset into `data`.
    fn get_index(&self, idx: &[usize; N]) -> usize {
        let mut i = 0;
        for j in 0..self.shape.len() {
            assert!(
                idx[j] < self.shape[j],
                "Index {} is out of bounds for dimension {} with size {}",
                idx[j], j, self.shape[j]
            );
            i = i * self.shape[j] + idx[j];
        }
        i
    }

    pub fn transpose(mut self) -> NdArray<T, N> {
        self.shape.reverse();
        self
    }
}
impl<T: Clone, const N: usize> Index<&[usize; N]> for NdArray<T, N> {
    type Output = T;
    fn index(&self, idx: &[usize; N]) -> &T {
        let i = self.get_index(idx);
        &self.data[i]
    }
}

impl<T: Clone, const N: usize> IndexMut<&[usize; N]> for NdArray<T, N> {
    fn index_mut(&mut self, idx: &[usize; N]) -> &mut T {
        let i = self.get_index(idx);
        &mut self.data[i]
    }
}
impl NdArray<f64, 2> {
    /// Finds the matrix product of 2 matrices
    pub fn matmul(&self, b: &NdArray<f64, 2>) -> NdArray<f64, 2> {
        assert_eq!(self.shape[1], b.shape[0]);
        let mut res: NdArray<f64, 2> = NdArray::zeros([self.shape[0], b.shape[1]]);
        for row in 0..self.shape[0] {
            for col in 0..b.shape[1] {
                for el in 0..b.shape[0] {
                    res[&[row, col]] += self[&[row, el]] * b[&[el, col]]
                }
            }
        }
        res
    }
}
impl<T: Clone + Add<Output = T>, const N: usize> AddAssign<NdArray<T, N>> for NdArray<T, N> {
    fn add_assign(&mut self, rhs: NdArray<T, N>) {
        assert_eq!(self.shape, rhs.shape, "shape mismatch in +=");
        let sum_vec: Vec<T> = self
            .data
            .iter()
            .zip(&rhs.data)
            .map(|(a, b)| a.clone() + b.clone())
            .collect();
        self.data = sum_vec;
    }
}
impl Deref for Tensor {
    type Target = Rc<RefCell<TensorData>>;
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for Tensor {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

impl Hash for Tensor {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.borrow().uuid.hash(state);
    }
}

impl PartialEq for Tensor {
    fn eq(&self, other: &Self) -> bool {
        self.borrow().uuid == other.borrow().uuid
    }
}

impl Eq for Tensor {}

#[derive(Clone)]
pub struct Tensor(Rc<RefCell<TensorData>>);
impl TensorData {
    fn new(data: NdArray<f64, 2>) -> TensorData {
        let shape = data.shape;
        TensorData {
            data,
            grad: NdArray::zeros(shape),
            uuid: Uuid::new_v4(),
            backward: None,
            prev: Vec::new(),
            op: None,
        }
    }
}
impl Tensor {
    pub fn new(array: NdArray<f64, 2>) -> Tensor {
        Tensor(Rc::new(RefCell::new(TensorData::new(array))))
    }

    pub fn shape(&self) -> [usize; 2] {
        self.borrow().data.shape
    }

    pub fn grad(&self) -> NdArray<f64, 2> {
        self.borrow().grad.clone()
    }

    pub fn grad_mut(&self) -> impl DerefMut<Target = NdArray<f64, 2>> + '_ {
        RefMut::map((*self.0).borrow_mut(), |mi| &mut mi.grad)
    }

    pub fn backward(&self) {
        // Topologically sort the graph, then propagate gradients in reverse order.
        let mut topo: Vec<Tensor> = vec![];
        let mut visited: HashSet<Tensor> = HashSet::new();
        self._build_topo(&mut topo, &mut visited);
        topo.reverse();
        // Seed the output gradient with ones.
        self.grad_mut().fill(1.0);
        for v in topo {
            if let Some(backprop) = v.borrow().backward {
                backprop(&v.borrow());
            }
        }
    }

    fn _build_topo(&self, topo: &mut Vec<Tensor>, visited: &mut HashSet<Tensor>) {
        if visited.insert(self.clone()) {
            self.borrow().prev.iter().for_each(|child| {
                child._build_topo(topo, visited);
            });
            topo.push(self.clone());
        }
    }

    pub fn matmul(&self, rhs: &Tensor) -> Tensor {
        let a_shape = self.shape();
        let b_shape = rhs.shape();
        assert_eq!(a_shape[1], b_shape[0]);
        let res: NdArray<f64, 2> = self.borrow().data.matmul(&rhs.borrow().data);
        let out = Tensor::new(res);
        out.borrow_mut().prev = vec![self.clone(), rhs.clone()];
        out.borrow_mut().op = Some(String::from("matmul"));
        out.borrow_mut().backward = Some(|value: &TensorData| {
            let lhs = value.prev[0].borrow().data.clone();
            let rhs = value.prev[1].borrow().data.clone();
            // For out = lhs @ rhs: d(lhs) = d(out) @ rhs^T, d(rhs) = lhs^T @ d(out)
            let da = value.grad.clone().matmul(&rhs.transpose());
            let db = lhs.transpose().matmul(&value.grad.clone());
            value.prev[0].borrow_mut().grad += da;
            value.prev[1].borrow_mut().grad += db;
        });
        out
    }
}
fn main() {
    let a_array = NdArray::new(&[1.0, 2.0, 3.0, 4.0], [2, 2]);
    let a = Tensor::new(a_array);
    let b = a.matmul(&a);
    b.backward();
    println!("Grad: {:?}", a.grad())
}
That is, each Tensor wraps an NdArray containing its data and an NdArray containing its gradient. The core NdArray type provides the basic array operations, including matmul(), dot(), etc.
I am trying to implement the matmul() method for autodiff, and this is what I have so far. When I test it, this matmul implementation gives the incorrect gradient of:

[[ 7.0  9.0]
 [11.0 13.0]]

rather than the correct gradient of:

[[ 7.0 11.0]
 [ 9.0 13.0]]
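For reference, the expected values follow from the standard matmul backward rule: for out = A @ B, dA = d(out) @ B^T and dB = A^T @ d(out). With the output gradient seeded to ones and A = B = a = [[1, 2], [3, 4]], a.grad should accumulate ones @ a^T + a^T @ ones. A quick hand check (just a sketch, e.g. dropped into main(); the transpose of a is written out literally rather than computed with transpose()):

let ones = NdArray::new(&[1.0, 1.0, 1.0, 1.0], [2, 2]);
let a_t = NdArray::new(&[1.0, 3.0, 2.0, 4.0], [2, 2]); // a^T written out by hand
let mut grad = ones.matmul(&a_t); // [[3, 7], [3, 7]]
grad += a_t.matmul(&ones);        // plus [[4, 4], [6, 6]]
// grad is now [[7, 11], [9, 13]], matching the expected gradient above.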
What part of the matmul implementation is incorrect?
I found the issue: it was in my implementation of transpose(). I replaced it with a new transpose() implemented for NdArray<f64, 2>. The working, correct implementation of transpose() is:
impl NdArray<f64, 2> {
    pub fn transpose(&self) -> NdArray<f64, 2> {
        let mut shape = self.shape.clone();
        shape.reverse();
        let mut result = NdArray::zeros(shape);
        for i in 0..shape[0] {
            for j in 0..shape[1] {
                result[&[i, j]] = self[&[j, i]];
            }
        }
        result
    }
}
With this change to the code, the gradients are accurate.
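For anyone hitting the same thing: the original transpose(mut self) only reversed shape and never touched the row-major data buffer, so for a square matrix it was effectively a no-op, which is why the reported gradient came out as the transpose of the expected one in this example. A small sketch of the difference (assuming the NdArray type above and the fixed transpose()):

let a = NdArray::new(&[1.0, 2.0, 3.0, 4.0], [2, 2]); // [[1, 2], [3, 4]]
let t = a.transpose();
assert_eq!(t.shape, [2, 2]);
assert_eq!(t.data, vec![1.0, 3.0, 2.0, 4.0]); // [[1, 3], [2, 4]]: columns became rows
// The old transpose(mut self) left data as [1.0, 2.0, 3.0, 4.0], so a square
// matrix was returned unchanged and the backward pass used a instead of a^T.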