Search code examples
Tags: php, api, checksum, php-curl, backblaze

"Checksum did not match data received" error when uploading a large file in parts using the Backblaze B2 native PHP API


I am using the following documentation for calling the Backblaze B2 API from PHP: https://www.backblaze.com/b2/docs/b2_upload_part.html

That page includes the sample code below:

<?php

    /**
     * Read callback registered with curl through CURLOPT_READFUNCTION.
     *
     * @param mixed    $curl_rsrc    first argument supplied by curl; not used here
     * @param resource $file_pointer stream the data is read from
     * @param int      $length       number of bytes requested for this call
     * @return string|false the result of fread() for that request
     */
    function myReadFile($curl_rsrc, $file_pointer, $length) {
        $buffer = fread($file_pointer, $length);
        return $buffer;
    }

    // Upload the local file part by part, one HTTP request per part.
    // $upload_url and $large_file_auth_token are expected to be defined before this point.
    $minimum_part_size = 100 * (1000 * 1000); // Bytes per part; obtained from b2_authorize_account (default is 100 MB)
    $local_file = "<path to large file>";
    $local_file_size = filesize($local_file);
    $total_bytes_sent = 0;
    $bytes_sent_for_part = 0;
    $bytes_sent_for_part = $minimum_part_size;
    $sha1_of_parts = [];
    $part_no = 1;
    $file_handle = fopen($local_file, "r");
    while ($local_file_size > $total_bytes_sent) {

        // Shrink the part when fewer bytes remain than the minimum part size
        if ($minimum_part_size > ($local_file_size - $total_bytes_sent)) {
            $bytes_sent_for_part = $local_file_size - $total_bytes_sent;
        }

        // Hash the bytes of this part, then move the file position back to the start of the part
        fseek($file_handle, $total_bytes_sent);
        $data_part = fread($file_handle, $bytes_sent_for_part);
        $sha1_of_parts[] = sha1($data_part);
        fseek($file_handle, $total_bytes_sent);

        // Send it over the wire
        $session = curl_init($upload_url);

        // Request headers describing this part
        $headers = [
            "Accept: application/json",
            "Authorization: " . $large_file_auth_token,
            "Content-Length: " . $bytes_sent_for_part,
            "X-Bz-Part-Number: " . $part_no,
            "X-Bz-Content-Sha1: " . $sha1_of_parts[$part_no - 1],
        ];

        curl_setopt($session, CURLOPT_POST, true);
        curl_setopt($session, CURLOPT_HTTPHEADER, $headers);
        curl_setopt($session, CURLOPT_INFILE, $file_handle);
        curl_setopt($session, CURLOPT_INFILESIZE, (int)$bytes_sent_for_part);
        curl_setopt($session, CURLOPT_RETURNTRANSFER, true); // Return the response instead of printing it
        // NOTE(review): myReadFile() returns whatever fread() yields for the requested length and
        // does not look at $bytes_sent_for_part — TODO confirm that curl stops at the part boundary.
        curl_setopt($session, CURLOPT_READFUNCTION, "myReadFile");

        $server_output = curl_exec($session);
        curl_close($session);
        echo $server_output, "\n";

        // Advance to the next part
        $part_no++;
        $total_bytes_sent += $bytes_sent_for_part;
        $read_file_bytes_read = 0;
    }
    fclose($file_handle);

?>

Running it gives me the following response:

{
    "code": "bad_request",
    "message": "Checksum did not match data received",
    "status": 400
}

Solution

  • The PHP manual's curl_setopt() documentation states:

    CURLOPT_INFILESIZE

    The expected size, in bytes, of the file when uploading a file to a remote site. Note that using this option will not stop libcurl from sending more data, as exactly what is sent depends on CURLOPT_READFUNCTION.

    This seems to imply that curl simply reads to the end of the file every time, regardless of the size given in CURLOPT_INFILESIZE.

    What you're going to want to do is make the function that you feed to CURLOPT_READFUNCTION fancier and aware of where it should stop reading its chunks.

    To this end I've written a CurlFileChunker class that encapsulates the task and all of its associated functions and variables.

    /**
     * Hands a seekable stream to curl one fixed-size chunk at a time.
     *
     * Typical cycle per chunk: check eof(), call getChunkInfo() to obtain the hash and
     * length for the request headers, let curl pull the data through curlReadFunction()
     * (which reports end-of-data at the chunk boundary), then call next() to open the
     * following chunk.
     */
    class CurlFileChunker {
        /** @var resource stream the chunks are read from */
        private $fp;
        /** @var int maximum number of bytes in one chunk */
        protected $chunkSize;
        /** @var int stream position up to which the current chunk has been handed out */
        protected $offset;
        /** @var int stream position at which the current chunk ends */
        protected $nextStop;

        /**
         * @param resource $fp        open, seekable stream; chunking starts at its current position
         * @param int      $chunkSize maximum number of bytes per chunk, must be at least 1
         * @throws \InvalidArgumentException when $chunkSize is not positive
         */
        public function __construct($fp, int $chunkSize) {
            if( $chunkSize < 1 ) {
                throw new \InvalidArgumentException('Chunk size must be a positive number of bytes.');
            }
            $this->fp = $fp;
            $this->chunkSize = $chunkSize;
            $this->offset = ftell($fp);
            $this->nextStop = $this->offset + $this->chunkSize;
        }

        /**
         * Reads up to one chunk from the current stream position.
         *
         * @return string the bytes read; empty when nothing could be read
         */
        protected function getChunk() {
            $chunk = fread($this->fp, $this->chunkSize);
            // fread() reports failure as false; treat that as "no data" so hash()/strlen() get a string.
            return $chunk === false ? '' : $chunk;
        }

        /**
         * Moves the stream back to the recorded offset.
         */
        protected function reset() {
            fseek($this->fp, $this->offset);
        }

        /**
         * Tells whether any data is left to be chunked.
         *
         * feof() alone only turns true after a read has run into the end of the stream, so
         * a stream whose length is an exact multiple of the chunk size would yield one extra,
         * empty chunk. To avoid that, a single byte is probed and the position restored.
         *
         * @return bool true when no further bytes can be read from the current position
         */
        public function eof() {
            if( feof($this->fp) ) {
                return true;
            }
            $position = ftell($this->fp);
            $probe = fread($this->fp, 1);
            if( $probe === false || $probe === '' ) {
                return true;
            }
            // Undo the probe so the next read starts where it would have without it.
            fseek($this->fp, $position);
            return false;
        }

        /**
         * Computes the hash and byte length of the upcoming chunk without consuming it.
         *
         * @param string $hashMethod algorithm name understood by hash(), e.g. 'sha1'
         * @return array with keys 'hash' (string) and 'length' (int)
         */
        public function getChunkInfo(string $hashMethod) {
            $chunk = $this->getChunk();
            $info = [
                'hash' => hash($hashMethod, $chunk),
                'length' => strlen($chunk)
            ];
            // Rewind so the same bytes are served again through curlReadFunction().
            $this->reset();
            return $info;
        }

        /**
         * Opens the next chunk: its end is one chunk size past the current offset.
         */
        public function next() {
            $this->nextStop = $this->offset + $this->chunkSize;
        }

        /**
         * Read callback for CURLOPT_READFUNCTION that never reads past the chunk boundary.
         *
         * @param mixed    $ch     first argument supplied by curl; not used here
         * @param resource $fp     stream supplied by curl; must be the stream given to the constructor
         * @param int      $length maximum number of bytes wanted for this call
         * @return string the next piece of the chunk, or '' once the chunk is exhausted
         * @throws \Exception when $fp is not the stream this object was built with
         */
        public function curlReadFunction($ch, $fp, int $length) {
            if( $fp !== $this->fp ) {
                throw new \Exception('File handle supplied differs from expected.');
            }

            $remaining = $this->nextStop - $this->offset;
            if( $remaining > 0 ) {
                // Serve the requested amount, capped at what is left of the current chunk.
                $out = fread($this->fp, min($length, $remaining));
                if( $out === false ) {
                    $out = '';
                }
            } else {
                // Chunk boundary reached: a zero byte string signals EOF to curl.
                $out = '';
            }
            $this->offset = ftell($this->fp);
            return $out;
        }
    }
    

    Here is an example usage that simulates curl reading in pieces smaller than the chunk:

    // Use a small in-memory stream so the example needs no file on disk.
    $fp = fopen('php://memory', 'rwb');
    fwrite($fp, 'lorem ipsum dolor sit amet');
    rewind($fp);

    // 10-byte chunks over a 26-byte input.
    $c = new CurlFileChunker($fp, 10);

    while( ! $c->eof() ) {
        // Hash and length that would go into the request headers for this chunk.
        $info = $c->getChunkInfo('sha1');
        var_dump($info);

        // Play the part of curl: request 7 bytes at a time until the chunker reports EOF.
        // The result is compared explicitly because a piece equal to "0" is falsy in PHP
        // and would otherwise end the loop early.
        $chunk = '';
        while( ($part = $c->curlReadFunction(NULL, $fp, 7)) !== '' && $part !== false ) {
            $chunk .= $part;
        }
        var_dump($chunk);
        // Hash of the bytes actually handed out, for comparison with $info['hash'].
        var_dump(sha1($chunk));
        $c->next();
    }
    

    Output:

    array(2) {
      ["hash"]=>
      string(40) "94ae3406c7e5e2ba31208dc623c20d2a107bfec2"
      ["length"]=>
      int(10)
    }
    string(10) "lorem ipsu"
    string(40) "94ae3406c7e5e2ba31208dc623c20d2a107bfec2"
    
    array(2) {
      ["hash"]=>
      string(40) "aebf816b6e13941737d5045c294ffe785ca55733"
      ["length"]=>
      int(10)
    }
    string(10) "m dolor si"
    string(40) "aebf816b6e13941737d5045c294ffe785ca55733"
    
    array(2) {
      ["hash"]=>
      string(40) "21d8e40707fa773b532ae892f82c057e92764f3a"
      ["length"]=>
      int(6)
    }
    string(6) "t amet"
    string(40) "21d8e40707fa773b532ae892f82c057e92764f3a"
    

    With that class in place, your upload loop becomes roughly:

    // Upload $local_file in 10 MiB parts, letting CurlFileChunker stop curl at each part boundary.
    // $local_file, $upload_url and $large_file_auth_token are expected to be defined before this point.
    $file_handle = fopen($local_file, "r");
    $c = new CurlFileChunker($file_handle, 10 * 1024 * 1024);
    $part_no = 1; // B2 part numbers start at 1
    while( ! $c->eof() ) {
        // Hash and length of the part about to be sent; the stream is left at the start of the part.
        $info = $c->getChunkInfo('sha1');
        if( $info['length'] === 0 ) {
            // Nothing left to send; never upload an empty part.
            break;
        }

        $session = curl_init($upload_url);
        $headers = array();
        $headers[] = "Accept: application/json";
        $headers[] = "Authorization: " . $large_file_auth_token;
        $headers[] = "Content-Length: " . $info['length'];
        $headers[] = "X-Bz-Part-Number: " . $part_no;
        $headers[] = "X-Bz-Content-Sha1: " . $info['hash'];
        curl_setopt($session, CURLOPT_POST, true);
        curl_setopt($session, CURLOPT_HTTPHEADER, $headers);  // Add headers
        curl_setopt($session, CURLOPT_INFILE, $file_handle);
        curl_setopt($session, CURLOPT_INFILESIZE, $info['length']);
        curl_setopt($session, CURLOPT_RETURNTRANSFER, true); // Receive server response
        curl_setopt($session, CURLOPT_READFUNCTION, [$c, 'curlReadFunction']);
        $server_output = curl_exec($session);
        curl_close ($session);
        print $server_output . "\n";

        // Prepare for the next part
        $part_no++;
        $c->next();
    }
    fclose($file_handle);