bdiff: don't check border condition in loop
This is pretty much a copy of d500ddae7494, just to a different loop.
The condition `p == plast` (`plast == a + len - 1`) was only true on
the final iteration of the loop. So it was wasteful to check for it
on every iteration. We decrease the iteration count by 1 and add an
explicit check for `p == plast` after the loop.
Again, we see modest wins.
From the mozilla-unified repository:
$ perfbdiff -m 3041e4d59df2
! wall 0.035502 comb 0.040000 user 0.040000 sys 0.000000 (best of 100)
! wall 0.030480 comb 0.030000 user 0.030000 sys 0.000000 (best of 100)
$ perfbdiff 0e9928989e9c --alldata --count 100
! wall 4.097394 comb 4.100000 user 4.100000 sys 0.000000 (best of 3)
! wall 3.597798 comb 3.600000 user 3.600000 sys 0.000000 (best of 3)
The 2nd example throws a total of ~3.3GB of data at bdiff. This
change increases the throughput from ~811 MB/s to ~924 MB/s.
#ifndef _HG_BITMANIPULATION_H_
#define _HG_BITMANIPULATION_H_
#include "compat.h"
static inline uint32_t getbe32(const char *c)
{
const unsigned char *d = (const unsigned char *)c;
return ((d[0] << 24) |
(d[1] << 16) |
(d[2] << 8) |
(d[3]));
}
static inline int16_t getbeint16(const char *c)
{
const unsigned char *d = (const unsigned char *)c;
return ((d[0] << 8) |
(d[1]));
}
static inline uint16_t getbeuint16(const char *c)
{
const unsigned char *d = (const unsigned char *)c;
return ((d[0] << 8) |
(d[1]));
}
static inline void putbe32(uint32_t x, char *c)
{
c[0] = (x >> 24) & 0xff;
c[1] = (x >> 16) & 0xff;
c[2] = (x >> 8) & 0xff;
c[3] = (x) & 0xff;
}
static inline double getbefloat64(const char *c)
{
const unsigned char *d = (const unsigned char *)c;
double ret;
int i;
uint64_t t = 0;
for (i = 0; i < 8; i++) {
t = (t<<8) + d[i];
}
memcpy(&ret, &t, sizeof(t));
return ret;
}
#endif