libzypp 17.31.23
mediablocklist.cc
Go to the documentation of this file.
1/*---------------------------------------------------------------------\
2| ____ _ __ __ ___ |
3| |__ / \ / / . \ . \ |
4| / / \ V /| _/ _/ |
5| / /__ | | | | | | |
6| /_____||_| |_| |_| |
7| |
8\---------------------------------------------------------------------*/
101#include "mediablocklist.h"
102
103#include <sys/types.h>
104#include <stdio.h>
105#include <stdlib.h>
106#include <string.h>
107
108#include <vector>
109#include <iostream>
110#include <fstream>
111
112#include <zypp-core/base/Logger.h>
113#include <zypp-core/base/String.h>
114#include <zypp-core/AutoDispose.h>
115#include <zypp-core/base/Exception.h>
116
117using namespace zypp::base;
118
119namespace zypp {
120 namespace media {
121
122 namespace {
123
124 struct rsum {
125 unsigned short a = 0;
126 unsigned short b = 0;
127 } __attribute__((packed));
128
129 /* rcksum_calc_rsum_block(data, data_len)
130 * Calculate the rsum for a single block of data. */
131 rsum rcksum_calc_rsum_block(const unsigned char *data, size_t len) {
132 unsigned short a = 0;
133 unsigned short b = 0;
134
135 while (len) {
136 unsigned char c = *data++;
137 a += c;
138 b += len * c;
139 len--;
140 }
141 return rsum{ a, b };
142 }
143
144 // update the rsum by removing the old and adding the new char
145 #define UPDATE_RSUM(a, b, oldc, newc, bshift) do { (a) += ((unsigned char)(newc)) - ((unsigned char)(oldc)); (b) += (a) - ((oldc) << (bshift)); } while (0)
146
151 void inline truncateRsum ( unsigned int &rs, const int rsumlen )
152 {
153 switch(rsumlen)
154 {
155 case 3:
156 rs &= 0xffffff;
157 break;
158 case 2:
159 rs &= 0xffff;
160 break;
161 case 1:
162 rs &= 0xff;
163 break;
164 default:
165 break;
166 }
167 }
168
169 }
170
172: filesize(size),
173 haveblocks(false),
174 chksumlen(0),
175 chksumpad(0),
176 rsumlen(0),
177 rsumseq(0),
178 rsumpad(0)
179{ }
180
181size_t
182MediaBlockList::addBlock(off_t off, size_t size)
183{
184 haveblocks = true;
185 blocks.push_back(MediaBlock( off, size ));
186 return blocks.size() - 1;
187}
188
189void
190MediaBlockList::setFileChecksum(std::string ctype, int cl, unsigned char *c)
191{
192 if (!cl)
193 return;
194 fsumtype = ctype;
195 fsum.resize(cl);
196 memcpy(&fsum[0], c, cl);
197}
198
200{
201 return fsumtype;
202}
203
205{
206 return fsum;
207}
208
209bool
211{
212 return digest.create(fsumtype);
213}
214
215bool
217{
218 if (!haveFileChecksum())
219 return true;
220 std::vector<unsigned char>dig = digest.digestVector();
221 if (dig.empty() || dig.size() < fsum.size())
222 return false;
223 return memcmp(&dig[0], &fsum[0], fsum.size()) ? false : true;
224}
225
226void
227MediaBlockList::setChecksum(size_t blkno, std::string cstype, int csl, unsigned char *cs, size_t cspad)
228{
229 if (!csl)
230 return;
231 if (!chksumlen)
232 {
233 if (blkno)
234 return;
235 chksumlen = csl;
236 chksumtype = cstype;
237 chksumpad = cspad;
238 }
239 if (csl != chksumlen || cstype != chksumtype || cspad != chksumpad || blkno != chksums.size() / chksumlen)
240 return;
241 chksums.resize(chksums.size() + csl);
242 memcpy(&chksums[csl * blkno], cs, csl);
243}
244
245void
246MediaBlockList::setRsum(size_t blkno, int rsl, unsigned int rs, size_t rspad)
247{
248 if (!rsl)
249 return;
250 if (!rsumlen)
251 {
252 if (blkno)
253 return;
254 rsumlen = rsl;
255 rsumpad = rspad;
256 }
257 if (rsl != rsumlen || rspad != rsumpad || blkno != rsums.size())
258 return;
259 rsums.push_back(rs);
260}
261
262void
264{
265 rsumseq = seq;
266}
267
268bool
270{
271 return digest.create(chksumtype);
272}
273
274bool
275MediaBlockList::verifyDigest(size_t blkno, Digest &digest) const
276{
277 if (!haveChecksum(blkno))
278 return true;
279 size_t size = blocks[blkno].size;
280 if (!size)
281 return true;
282 if (chksumpad > size)
283 {
284 char pad[chksumpad - size];
285 memset(pad, 0, chksumpad - size);
286 digest.update(pad, chksumpad - size);
287 }
288 std::vector<unsigned char>dig = digest.digestVector();
289 if (dig.empty() || dig.size() < size_t(chksumlen))
290 return false;
291 return memcmp(&dig[0], &chksums[chksumlen * blkno], chksumlen) ? false : true;
292}
293
294unsigned int
295MediaBlockList::updateRsum(unsigned int rs, const char* bytes, size_t len) const
296{
297 if (!len)
298 return rs;
299
300 unsigned short s, m;
301 s = (rs >> 16) & 65535;
302 m = rs & 65535;
303 for (; len > 0 ; len--)
304 {
305 unsigned short c = (unsigned char)*bytes++;
306 s += c;
307 m += s;
308 }
309 return (s & 65535) << 16 | (m & 65535);
310}
311
312bool
313MediaBlockList::verifyRsum(size_t blkno, unsigned int rs) const
314{
315 if (!haveRsum(blkno))
316 return true;
317 size_t size = blocks[blkno].size;
318 if (!size)
319 return true;
320 if (rsumpad > size)
321 {
322 unsigned short s, m;
323 s = (rs >> 16) & 65535;
324 m = rs & 65535;
325 m += s * (rsumpad - size);
326 rs = (s & 65535) << 16 | (m & 65535);
327 }
328 truncateRsum( rs, rsumlen );
329 return rs == rsums[blkno];
330}
331
332bool
333MediaBlockList::checkRsum(size_t blkno, const unsigned char *buf, size_t bufl) const
334{
335 if (blkno >= blocks.size() || bufl < blocks[blkno].size)
336 return false;
337 unsigned int rs = updateRsum(0, (const char *)buf, blocks[blkno].size);
338 return verifyRsum(blkno, rs);
339}
340
341bool
342MediaBlockList::checkChecksum(size_t blkno, const unsigned char *buf, size_t bufl) const
343{
344 if (blkno >= blocks.size() || bufl < blocks[blkno].size)
345 return false;
346 Digest dig;
347 if (!createDigest(dig))
348 return false;
349 dig.update((const char *)buf, blocks[blkno].size);
350 return verifyDigest(blkno, dig);
351}
352
354{
355 if ( !haveChecksum(blkno) )
356 return {};
357
358 UByteArray buf ( chksumlen, '\0' );
359 memcpy( buf.data(), chksums.data()+(chksumlen * blkno), chksumlen );
360 return buf;
361}
362
364{
365 return chksumtype;
366}
367
369{
370 return chksumpad;
371}
372
373// specialized version of checkChecksum that can deal with a "rotated" buffer
374bool
375MediaBlockList::checkChecksumRotated(size_t blkno, const unsigned char *buf, size_t bufl, size_t start) const
376{
377 if (blkno >= blocks.size() || bufl < blocks[blkno].size)
378 return false;
379 if (start == bufl)
380 start = 0;
381 Digest dig;
382 if (!createDigest(dig))
383 return false;
384 size_t size = blocks[blkno].size;
385 size_t len = bufl - start > size ? size : bufl - start;
386 dig.update((const char *)buf + start, len);
387 if (size > len)
388 dig.update((const char *)buf, size - len);
389 return verifyDigest(blkno, dig);
390}
391
392// write block to the file. can also deal with "rotated" buffers
393void
394MediaBlockList::writeBlock(size_t blkno, FILE *fp, const unsigned char *buf, size_t bufl, size_t start, std::vector<bool> &found) const
395{
396 if (blkno >= blocks.size() || bufl < blocks[blkno].size)
397 return;
398 off_t off = blocks[blkno].off;
399 size_t size = blocks[blkno].size;
400 if (fseeko(fp, off, SEEK_SET))
401 return;
402 if (start == bufl)
403 start = 0;
404 size_t len = bufl - start > size ? size : bufl - start;
405 if (fwrite(buf + start, len, 1, fp) != 1)
406 return;
407 if (size > len && fwrite(buf, size - len, 1, fp) != 1)
408 return;
409 found[blkno] = true;
410 found[blocks.size()] = true;
411}
412
413static size_t
414fetchnext(FILE *fp, unsigned char *bp, size_t blksize, size_t pushback, unsigned char *pushbackp)
415{
416 size_t l = blksize;
417 int c;
418
419 if (pushback)
420 {
421 if (pushbackp != bp)
422 memmove(bp, pushbackp, pushback);
423 bp += pushback;
424 l -= pushback;
425 }
426 while (l)
427 {
428 c = getc(fp);
429 if (c == EOF)
430 break;
431 *bp++ = c;
432 l--;
433 }
434 if (l)
435 memset(bp, 0, l);
436 return blksize - l;
437}
438
439void MediaBlockList::reuseBlocks(FILE *wfp, std::string filename)
440{
441
443
444 if ( !chksumlen ) {
445 DBG << "Delta XFER: Can not reuse blocks because we have no chksumlen" << std::endl;
446 return;
447 }
448
449 if ( (fp = fopen(filename.c_str(), "r")) == 0 ) {
450 DBG << "Delta XFER: Can not reuse blocks, unable to open file "<< filename << std::endl;
451 return;
452 }
453
454 size_t nblks = blocks.size();
455 std::vector<bool> found( nblks + 1 );
456 if (rsumlen && !rsums.empty()) {
457
458 const auto rsumAMask = rsumlen < 3 ? 0 : rsumlen == 3 ? 0xff : 0xffff;
459
460 // we are building a array of rsum structs to directly access a and b parts of the checksum
461 // we make the array bigger so that the code calculating the rsum hashes does not need to care
462 // about the array size
463 // @TODO move that to the function adding new rsums when removing the old code
464 auto zsyncRsumsData = std::make_unique<rsum[]>( rsums.size() + rsumseq );
465 auto zsyncRsums = zsyncRsumsData.get();
466 for ( std::size_t i = 0; i < rsums.size(); i++ ) {
467 const auto &rs = rsums[i];
468 unsigned short s, m;
469 s = (rs >> 16) & 65535;
470 m = rs & 65535;
471 zsyncRsums[i] = rsum{ s, m };
472 }
473
474 // we use the same code as zsync to calc the hash
475 const auto & calc_rhash = [&]( const rsum* e ) -> unsigned {
476 unsigned h = e[0].b;
477 if ( this->rsumseq > 1 ) {
478 for ( uint i = 1; i < rsumseq; i++ ) {
479 h ^= e[i].b << 3;
480 }
481 } else {
482 h ^= ( e[0].a & rsumAMask ) << 3;
483 }
484 return h;
485 };
486
487 size_t blksize = blocks[0].size;
488 if (nblks == 1 && rsumpad && rsumpad > blksize)
489 blksize = rsumpad;
490
491 // create hash of checksums
492 // build the hashtable
493 uint rsumHashMask;
494 {
495 int i = 16;
496
497 /* Try hash size of 2^i; step down the value of i until we find a good size */
498 while ((2 << (i - 1)) > nblks && i > 4)
499 i--;
500
501 /* Allocate hash based on rsum */
502 rsumHashMask = (2 << i) - 1;
503 }
504
505 // a array indexed via hash with lists of blocks resulting in the same rsum hash
506 auto rsumHashTableData = std::make_unique<std::vector<size_t>[]>( rsumHashMask + 1 );
507 auto rsumHashTable = rsumHashTableData.get();
508
509 // Now fill in the hash tables.
510 for ( size_t id = 0; id < nblks; id++) {
511 const auto hash = calc_rhash( &zsyncRsums[id] );
512 auto &hashList = rsumHashTable[ hash & rsumHashMask ];
513 hashList.push_back(id);
514 }
515
516 // we read in 16 sequences at once to speed up processing
517 constexpr auto BLOCKCNT = 16;
518
519 // we allocate the buffer so that we always have the data to verify 16 blocks, if we need to do
520 // sequence matching we grow the buffer accordingly
521 const auto readBufSize = blksize * rsumseq * BLOCKCNT;
522
523 // buffer thats going to hold our cached data
524 auto readBufData = std::make_unique<unsigned char[]>( readBufSize );
525 memset(readBufData.get(), 0, blksize);
526
527 // avoid using .get() all the time
528 auto readBuf = readBufData.get();
529
530 // our running checksums for the blocks we need to match in sequence
531 auto seqRsumsData = std::make_unique<rsum[]> ( rsumseq );
532 auto seqRsums = seqRsumsData.get();
533
534 // use byteshift instead of multiplication if the blksize is a power of 2
535 // a value is a power of 2 if ( N & N-1 ) == 0
536 int bshift = 0; // how many bytes do we need to shift
537 if ((blksize & (blksize - 1)) == 0)
538 for (bshift = 0; size_t(1 << bshift) != blksize; bshift++)
539 ;
540
541 bool init = true;
542 // when we are in a run of matches, we remember which block ID would need to match next in order
543 // to continue writing
544 std::optional<size_t> nextReqMatchInSequence;
545 off_t dataOffset = 0; //< Our current read offset in the buffer
546 off_t dataLen = 0; //< The length of our read buffer
547
548 // helper lambda that follows a list of hashmap entries and tries to write those that match
549 const auto &tryWriteMatchingBlocks = [&]( const std::vector<size_t> &list, const u_char *currBuf, uint reqMatches ){
550 // the number of blocks we have transferred to the target file
551 int targetBlocksWritten = 0;
552
553 // reset the next match hint
554 nextReqMatchInSequence.reset();
555
556 for ( const auto blkno : list ) {
557
558 if ( found[blkno] )
559 continue;
560
561 const auto blockRsum = &zsyncRsums[blkno];
562
563 uint weakMatches = 0;
564
565 // first check only the current block, we maybe can skip checking the others
566 // if we are in a run of matches
567 if ( (seqRsums[0].a & rsumAMask) != blockRsum[0].a ||
568 seqRsums[0].b != blockRsum[0].b )
569 continue;
570
571 weakMatches++;
572
573 for ( uint i = 1; i < reqMatches; i++ ) {
574 if ( (seqRsums[i].a & rsumAMask) != blockRsum[i].a ||
575 seqRsums[i].b != blockRsum[i].b )
576 break;
577 weakMatches++;
578 }
579
580 if ( weakMatches < reqMatches )
581 continue;
582
583 // we have a weak match, now we need to calc the checksums for the blocks
584 uint realMatches = 0;
585 for( uint i = 0; i < reqMatches; i++ ) {
586 if ( !checkChecksum(blkno + i, currBuf + ( i * blksize ), blksize ) ) {
587 break;
588 }
589 realMatches++;
590 }
591
592 // check if we have the amount of matches we need ( only 1 if we are in a block sequence )
593 if( realMatches < reqMatches )
594 continue;
595
596 // we found blocks that match , write them to target but keep searching the hashmap
597 // in case we have redundancies
598 const auto nextPossibleMatch = blkno + realMatches;
599 if ( !found[nextPossibleMatch] )
600 nextReqMatchInSequence = nextPossibleMatch; // remember that we are currently in a run of matches, next iteration we just need to look at one block
601
602 for( uint i = 0; i < realMatches; i++ ) {
603 writeBlock( blkno + i, wfp, currBuf + ( i * blksize ), blksize, 0, found );
604 targetBlocksWritten++;
605 }
606 }
607 return targetBlocksWritten;
608 };
609
610 if (!rsumseq)
611 rsumseq = nblks > 1 && chksumlen < 16 ? 2 : 1;
612
613 const off_t seqMatchLen = ( blksize * rsumseq ); //< how many bytes do we need to match when searching a block
614
615 while (! feof(fp) ) {
616 if ( init ) {
617 // fill the buffer for the first time
618 dataLen = fread( readBuf, 1, readBufSize, fp );
619 init = false;
620 } else {
621 // move the remaining data to the begin and read from the file to fill up the buffer again
622 const auto remainLen = dataLen-dataOffset;
623 if ( remainLen )
624 memmove( readBuf, readBuf+dataOffset, remainLen );
625
626 dataLen = fread( readBuf+remainLen, 1, readBufSize-remainLen, fp );
627 dataLen += remainLen;
628 dataOffset = 0;
629 }
630
631 // if we hit eof, pad with zeros
632 if ( feof(fp) ) {
633 memset( readBuf + dataLen, 0, readBufSize - dataLen );
634 dataLen = readBufSize;
635 }
636
637 if ( dataLen < seqMatchLen )
638 return;
639
640 // intialize our first set of checksums
641 for( uint i = 0; i < rsumseq; i++ )
642 seqRsums[i] = rcksum_calc_rsum_block( readBuf + ( i * blksize ), blksize );
643
644 //read over the buffer we have allocated so far
645 while ( true ) {
646
647 if ( dataOffset + seqMatchLen > dataLen )
648 break;
649
650 u_char *currBuf = readBuf + dataOffset;
651
652 // the number of deltafile blocks we have matched, e.g. how much blocks
653 // can we skip forward
654 uint deltaBlocksMatched = 0;
655
656 if ( nextReqMatchInSequence.has_value() ) {
657 if ( tryWriteMatchingBlocks( { *nextReqMatchInSequence }, currBuf, 1 ) > 0 )
658 deltaBlocksMatched = 1;
659
660 } else {
661 const auto hash = calc_rhash( seqRsums );
662
663 // reference to the list of blocks that share our calculated hash
664 auto &blockListForHash = rsumHashTable[ hash & rsumHashMask ];
665 if ( blockListForHash.size() ) {
666 if ( tryWriteMatchingBlocks( blockListForHash, currBuf, rsumseq ) > 0 )
667 deltaBlocksMatched = rsumseq;
668 }
669 }
670
671 if ( deltaBlocksMatched > 0 ) {
672 // we jump forward in the buffer to after what we matched
673 dataOffset += ( deltaBlocksMatched * blksize );
674
675 if ( dataOffset + seqMatchLen > dataLen )
676 break;
677
678 if ( deltaBlocksMatched < rsumseq ) {
679 //@TODO move the rsums we already have
680 }
681
682 for( uint i = 0; i < rsumseq; i++ )
683 seqRsums[i] = rcksum_calc_rsum_block( readBuf + dataOffset + ( i * blksize ), blksize );
684
685
686 } else {
687 // we found nothing advance the window by one byte and update the rsums
688 dataOffset++;
689 if ( dataOffset + seqMatchLen > dataLen )
690 break;
691 for ( uint i = 0; i < rsumseq; i++ ) {
692 const auto blkOff = ( i*blksize );
693 u_char oldC = (currBuf + blkOff)[0];
694 u_char newC = (currBuf + blkOff)[blksize];
695 UPDATE_RSUM( seqRsums[i].a, seqRsums[i].b, oldC, newC, bshift );
696 }
697 }
698 }
699 }
700 }
701 else if (chksumlen >= 16)
702 {
703 // dummy variant, just check the checksums
704 size_t bufl = 4096;
705 off_t off = 0;
706 auto buf = std::make_unique<unsigned char[]>( bufl );
707 for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
708 {
709 if (off > blocks[blkno].off)
710 continue;
711 size_t blksize = blocks[blkno].size;
712 if (blksize > bufl)
713 {
714 bufl = blksize;
715 buf = std::make_unique<unsigned char[]>( bufl );
716 }
717 size_t skip = blocks[blkno].off - off;
718 while (skip)
719 {
720 size_t l = skip > bufl ? bufl : skip;
721 if (fread(buf.get(), l, 1, fp) != 1)
722 break;
723 skip -= l;
724 off += l;
725 }
726 if (fread(buf.get(), blksize, 1, fp) != 1)
727 break;
728 if (checkChecksum(blkno, buf.get(), blksize))
729 writeBlock(blkno, wfp, buf.get(), blksize, 0, found);
730 off += blksize;
731 }
732 }
733 if (!found[nblks]) {
734 DBG << "Delta XFER: No reusable blocks found for " << filename << std::endl;
735 return;
736 }
737 // now throw out all of the blocks we found
738 std::vector<MediaBlock> nblocks;
739 std::vector<unsigned char> nchksums;
740 std::vector<unsigned int> nrsums;
741
742 size_t originalSize = 0;
743 size_t newSize = 0;
744 for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
745 {
746 const auto &blk = blocks[blkno];
747 originalSize += blk.size;
748 if (!found[blkno])
749 {
750 // still need it
751 nblocks.push_back(blk);
752 newSize += blk.size;
753 if (chksumlen && (blkno + 1) * chksumlen <= chksums.size())
754 {
755 nchksums.resize(nblocks.size() * chksumlen);
756 memcpy(&nchksums[(nblocks.size() - 1) * chksumlen], &chksums[blkno * chksumlen], chksumlen);
757 }
758 if (rsumlen && (blkno + 1) <= rsums.size())
759 nrsums.push_back(rsums[blkno]);
760 }
761 }
762 DBG << "Delta XFER: Found blocks to reuse, " << blocks.size() << " vs " << nblocks.size() << ", resused blocks: " << blocks.size() - nblocks.size() << "\n"
763 << "Old transfer size: " << originalSize << " new size: " << newSize << std::endl;
764 blocks = nblocks;
765 chksums = nchksums;
766 rsums = nrsums;
767}
768
769void MediaBlockList::reuseBlocksOld(FILE *wfp, std::string filename)
770{
771
773
774 if ( !chksumlen ) {
775 DBG << "Delta XFER: Can not reuse blocks because we have no chksumlen" << std::endl;
776 return;
777 }
778
779 if ( (fp = fopen(filename.c_str(), "r")) == 0 ) {
780 DBG << "Delta XFER: Can not reuse blocks, unable to open file "<< filename << std::endl;
781 return;
782 }
783 size_t nblks = blocks.size();
784 std::vector<bool> found;
785 found.resize(nblks + 1);
786 if (rsumlen && !rsums.empty())
787 {
788 size_t blksize = blocks[0].size;
789 if (nblks == 1 && rsumpad && rsumpad > blksize)
790 blksize = rsumpad;
791
792 // create hash of checksums
793
794 // calculate the size of the hashtable by setting
795 // all bits to 1 up the to currently set MSB
796 // if we have 00010010 we end up with 00011111
797 unsigned int hm = rsums.size() * 2;
798 while (hm & (hm - 1)) {
799 hm &= hm - 1;
800 }
801 hm = hm * 2 - 1;
802
803 // we want at least a size if 0011 1111 1111 1111
804 if (hm < 16383)
805 hm = 16383;
806
807 // simple hashtable of checksums
808 auto rsumHashTable = std::make_unique<unsigned int[]>( hm+1 );
809 memset(rsumHashTable.get(), 0, (hm + 1) * sizeof(unsigned int));
810
811 // insert each rsum into the hash table
812 for (unsigned int i = 0; i < rsums.size(); i++)
813 {
814 if (blocks[i].size != blksize && (i != nblks - 1 || rsumpad != blksize))
815 continue;
816 unsigned int r = rsums[i];
817 unsigned int h = r & hm;
818 unsigned int hh = 7;
819 while (rsumHashTable[h])
820 h = (h + hh++) & hm;
821 rsumHashTable[h] = i + 1;
822 }
823
824 // read in block by block to find matches
825 // the read buffer "buf" works like a ring buffer, means that once we are done with reading a full block
826 // and didn't find a match we start again at buf[0] , filling the buffer up again, rotating until we find
827 // a matching block. Once we find a matching block all we need to do is check if the current offset "i"
828 // is at the end of the buffer, then we can simply write the full buffer out, or if its somewhere in between
829 // then the begin of our block is buf[i+1, bufsize-1] and the end buf[0,i]
830 auto ringBuf = std::make_unique<unsigned char[]>( blksize );
831
832 // we use a second buffer to read in the next block if we are required to match more than one block at the same time.
833 auto buf2 = std::make_unique<unsigned char[]>( blksize );
834
835 // when we are required to match more than one block, it is read into buf2 advancing the file pointer,
836 // to make sure that we do not loose those bytes in case the match fails we remember their count and
837 // start in buf2, in the next loop those will be consumed before reading from the file again
838 size_t pushback = 0;
839 unsigned char *pushbackp = 0;
840
841 // use byteshift instead of multiplication if the blksize is a power of 2
842 // a value is a power of 2 if ( N & N-1 ) == 0
843 int bshift = 0; // how many bytes do we need to shift
844 if ((blksize & (blksize - 1)) == 0)
845 for (bshift = 0; size_t(1 << bshift) != blksize; bshift++)
846 ;
847
848 // a and b are the LS and MS bytes of the checksum, calculated a rolling style Adler32 checksum
849 //
850 // a(k,l) = (\sum_{i=k}^l X_i) \bmod M
851 unsigned short a, b;
852 a = b = 0;
853 memset(ringBuf.get(), 0, blksize);
854 bool eof = 0;
855 bool init = 1;
856
857 if (!rsumseq)
858 rsumseq = nblks > 1 && chksumlen < 16 ? 2 : 1;
859
860 while (!eof)
861 {
862 for (size_t i = 0; i < blksize; i++)
863 {
864 // get the next character from the file
865 // or if there are pushback chars use those
866 int c;
867 if (eof)
868 c = 0;
869 else
870 {
871 if (pushback)
872 {
873 c = *pushbackp++;
874 pushback--;
875 }
876 else
877 c = getc(fp);
878 if (c == EOF)
879 {
880 eof = true;
881 c = 0;
882 if (!i || rsumseq == 2)
883 break;
884 }
885 }
886
887 // calculate the rsum on the fly using recurrence relations, see https://rsync.samba.org/tech_report/node3.html
888 // basically we subtract the checksum value of a byte the leaves the current block window and add the new incoming one
889 // using this trick we do not need to calculate the full block checksum
890 // the least significant part of the checksum ( lower 8 bits ) is simply the sum of all chars in the block , modulo 2^16
891 // zsync uses only a 16bit type to calculate the sums and as far as i can see does not do the modulo per block as the formula
892 // says it should, we might need to do the same
893 int oc = ringBuf[i];
894 ringBuf[i] = c;
895
896 a += c - oc;
897
898 // this is calculates the most significant part of the checksum, bshift should be always set since blocksize
899 // should always be a power of 2
900 if (bshift)
901 b += a - ( oc << bshift );
902 else
903 // This seems to make no sense it does not even factor in the character itself
904 b += 2 * blksize;
905
906 if (init)
907 {
908 // continue reading bytes until we have the full block in our buffer
909 if (size_t(i) != blksize - 1)
910 continue;
911 init = 0;
912 }
913
914 unsigned int r = ((unsigned int)a & 65535) << 16 | ((unsigned int)b & 65535);
915 truncateRsum(r, rsumlen);
916
917 unsigned int h = r & hm;
918 unsigned int hh = 7;
919
920 // go through our hashmap to find all the matching rsums
921 for (; rsumHashTable[h]; h = (h + hh++) & hm)
922 {
923 size_t blkno = rsumHashTable[h] - 1;
924
925 // does the current block match?
926 if (rsums[blkno] != r)
927 continue;
928 if (found[blkno])
929 continue;
930
931 // if we need to always match 2 blocks in sequence, get the next block
932 // and check its checksum
933 if (rsumseq == 2)
934 {
935 if (eof || blkno + 1 >= nblks)
936 continue;
937 pushback = fetchnext(fp, buf2.get(), blksize, pushback, pushbackp);
938 pushbackp = buf2.get();
939 if (!pushback)
940 continue;
941
942 if (!checkRsum(blkno + 1, buf2.get(), blksize))
943 continue;
944 }
945
946 // here we have matched all blocks that we need, do the heavy checksum
947 if (!checkChecksumRotated(blkno, ringBuf.get(), blksize, i + 1))
948 continue;
949
950 // heavy checksum for second block
951 if (rsumseq == 2 && !checkChecksum(blkno + 1, buf2.get(), blksize))
952 continue;
953
954 // write the first and second blocks if applicable
955 writeBlock(blkno, wfp, ringBuf.get(), blksize, i + 1, found);
956 if (rsumseq == 2)
957 {
958 writeBlock(blkno + 1, wfp, buf2.get(), blksize, 0, found);
959 pushback = 0;
960 blkno++;
961 }
962
963 // try to continue as long as we still match blocks
964 while (!eof)
965 {
966 blkno++;
967 pushback = fetchnext(fp, buf2.get(), blksize, pushback, pushbackp);
968 pushbackp = buf2.get();
969 if (!pushback)
970 break;
971
972 if (!checkRsum(blkno, buf2.get(), blksize))
973 break;
974
975 if (!checkChecksum(blkno, buf2.get(), blksize))
976 break;
977
978 writeBlock(blkno, wfp, buf2.get(), blksize, 0, found);
979 pushback = 0;
980 }
981
982 // if we get to this part we found at least a block, skip over the current block and start reading
983 // in a full block
984 init = true;
985 memset(ringBuf.get(), 0, blksize);
986 a = b = 0;
987 i = size_t(-1); // start with 0 on next iteration
988 break;
989 }
990 }
991 }
992 }
993 else if (chksumlen >= 16)
994 {
995 // dummy variant, just check the checksums
996 size_t bufl = 4096;
997 off_t off = 0;
998 auto buf = std::make_unique<unsigned char[]>( bufl );
999 for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
1000 {
1001 if (off > blocks[blkno].off)
1002 continue;
1003 size_t blksize = blocks[blkno].size;
1004 if (blksize > bufl)
1005 {
1006 bufl = blksize;
1007 buf = std::make_unique<unsigned char[]>( bufl );
1008 }
1009 size_t skip = blocks[blkno].off - off;
1010 while (skip)
1011 {
1012 size_t l = skip > bufl ? bufl : skip;
1013 if (fread(buf.get(), l, 1, fp) != 1)
1014 break;
1015 skip -= l;
1016 off += l;
1017 }
1018 if (fread(buf.get(), blksize, 1, fp) != 1)
1019 break;
1020 if (checkChecksum(blkno, buf.get(), blksize))
1021 writeBlock(blkno, wfp, buf.get(), blksize, 0, found);
1022 off += blksize;
1023 }
1024 }
1025 if (!found[nblks]) {
1026 DBG << "Delta XFER: No reusable blocks found for " << filename << std::endl;
1027 return;
1028 }
1029 // now throw out all of the blocks we found
1030 std::vector<MediaBlock> nblocks;
1031 std::vector<unsigned char> nchksums;
1032 std::vector<unsigned int> nrsums;
1033
1034 size_t originalSize = 0;
1035 size_t newSize = 0;
1036 for (size_t blkno = 0; blkno < blocks.size(); ++blkno)
1037 {
1038 const auto &blk = blocks[blkno];
1039 originalSize += blk.size;
1040 if (!found[blkno])
1041 {
1042 // still need it
1043 nblocks.push_back(blk);
1044 newSize += blk.size;
1045 if (chksumlen && (blkno + 1) * chksumlen <= chksums.size())
1046 {
1047 nchksums.resize(nblocks.size() * chksumlen);
1048 memcpy(&nchksums[(nblocks.size() - 1) * chksumlen], &chksums[blkno * chksumlen], chksumlen);
1049 }
1050 if (rsumlen && (blkno + 1) <= rsums.size())
1051 nrsums.push_back(rsums[blkno]);
1052 }
1053 }
1054 DBG << "Delta XFER: Found blocks to reuse, " << blocks.size() << " vs " << nblocks.size() << ", resused blocks: " << blocks.size() - nblocks.size() << "\n"
1055 << "Old transfer size: " << originalSize << " new size: " << newSize << std::endl;
1056 blocks = nblocks;
1057 chksums = nchksums;
1058 rsums = nrsums;
1059}
1060
1061std::string
1063{
1064 std::string s;
1065 size_t i, j;
1066
1067 if (filesize != off_t(-1))
1068 {
1069 long long size = filesize;
1070 s = zypp::str::form("[ BlockList, file size %lld\n", size);
1071 }
1072 else
1073 s = "[ BlockList, filesize unknown\n";
1074 if (!haveblocks)
1075 s += " No block information\n";
1076 if (chksumpad)
1077 s += zypp::str::form(" Checksum pad %zd\n", chksumpad);
1078 if (rsumpad)
1079 s += zypp::str::form(" Rsum pad %zd\n", rsumpad);
1080 for (i = 0; i < blocks.size(); ++i)
1081 {
1082 long long off=blocks[i].off;
1083 long long size=blocks[i].size;
1084 s += zypp::str::form(" (%8lld, %8lld)", off, size);
1085 if (chksumlen && chksums.size() >= (i + 1) * chksumlen)
1086 {
1087 s += " " + chksumtype + ":";
1088 for (j = 0; j < size_t(chksumlen); j++)
1089 s += zypp::str::form("%02hhx", chksums[i * chksumlen + j]);
1090 }
1091 if (rsumlen && rsums.size() > i)
1092 {
1093 s += " RSUM:";
1094 s += zypp::str::form("%0*x", 2 * rsumlen, rsums[i]);
1095 }
1096 s += "\n";
1097 }
1098 s += "]";
1099 return s;
1100}
1101
1102 } // namespace media
1103} // namespace zypp
Compute Message Digests (MD5, SHA1 etc)
Definition: Digest.h:37
UByteArray digestVector()
get vector of unsigned char representation of the digest
Definition: Digest.cc:261
bool update(const char *bytes, size_t len)
feed data into digest computation algorithm
Definition: Digest.cc:279
bool create(const std::string &name)
initialize creation of a new message digest
Definition: Digest.cc:173
std::vector< unsigned int > rsums
void setRsum(size_t blkno, int rsl, unsigned int rs, size_t rspad=0)
set / verify the (weak) rolling checksum over a single block
bool haveChecksum(size_t blkno) const
void setFileChecksum(std::string ctype, int cl, unsigned char *c)
set / verify the checksum over the whole file
bool verifyRsum(size_t blkno, unsigned int rs) const
void reuseBlocks(FILE *wfp, std::string filename)
const UByteArray & getFileChecksum()
void writeBlock(size_t blkno, FILE *fp, const unsigned char *buf, size_t bufl, size_t start, std::vector< bool > &found) const
bool createDigest(Digest &digest) const
std::string asString() const
return block list as string
void setRsumSequence(uint seq)
how many blocks in sequence need to have the correct checksums to be considered a match
UByteArray getChecksum(size_t blkno) const
bool checkChecksumRotated(size_t blkno, const unsigned char *buf, size_t bufl, size_t start) const
std::vector< unsigned char > chksums
size_t addBlock(off_t off, size_t size)
add a block with offset off and size size to the block list.
unsigned int updateRsum(unsigned int rs, const char *bytes, size_t len) const
void setChecksum(size_t blkno, std::string cstype, int csl, unsigned char *cs, size_t cspad=0)
set / verify the (strong) checksum over a single block
bool verifyDigest(size_t blkno, Digest &digest) const
bool checkRsum(size_t blkno, const unsigned char *buf, size_t bufl) const
std::vector< MediaBlock > blocks
std::string getChecksumType() const
bool createFileDigest(Digest &digest) const
bool verifyFileDigest(Digest &digest) const
void reuseBlocksOld(FILE *wfp, std::string filename)
scan a file for blocks from our blocklist.
std::string fileChecksumType() const
bool haveRsum(size_t blkno) const
bool checkChecksum(size_t blkno, const unsigned char *buf, size_t bufl) const
MediaBlockList(off_t filesize=off_t(-1))
#define UPDATE_RSUM(a, b, oldc, newc, bshift)
unsigned short a
unsigned short b
static size_t fetchnext(FILE *fp, unsigned char *bp, size_t blksize, size_t pushback, unsigned char *pushbackp)
std::string form(const char *format,...) __attribute__((format(printf
Printf style construction of std::string.
Definition: String.cc:36
Easy-to use interface to the ZYPP dependency resolver.
Definition: CodePitfalls.doc:2
AutoDispose<FILE*> calling ::fclose
Definition: AutoDispose.h:313
a single block from the blocklist, consisting of an offset and a size
#define DBG
Definition: Logger.h:95