bdiff: don't check border condition in loop
This is pretty much a copy of d500ddae7494, just to a different loop.
The condition `p == plast` (`plast == a + len - 1`) was only true on
the final iteration of the loop. So it was wasteful to check for it
on every iteration. We decrease the iteration count by 1 and add an
explicit check for `p == plast` after the loop.
Again, we see modest wins.
From the mozilla-unified repository:
$ perfbdiff -m 3041e4d59df2
! wall 0.035502 comb 0.040000 user 0.040000 sys 0.000000 (best of 100)
! wall 0.030480 comb 0.030000 user 0.030000 sys 0.000000 (best of 100)
$ perfbdiff 0e9928989e9c --alldata --count 100
! wall 4.097394 comb 4.100000 user 4.100000 sys 0.000000 (best of 3)
! wall 3.597798 comb 3.600000 user 3.600000 sys 0.000000 (best of 3)
The 2nd example throws a total of ~3.3GB of data at bdiff. This
change increases the throughput from ~811 MB/s to ~924 MB/s.
#ifndef _HG_COMPAT_H_
#define _HG_COMPAT_H_
#ifdef _WIN32
#ifdef _MSC_VER
/* msvc 6.0 has problems */
#define inline __inline
#if defined(_WIN64)
typedef __int64 ssize_t;
#else
typedef int ssize_t;
#endif
typedef signed char int8_t;
typedef short int16_t;
typedef long int32_t;
typedef __int64 int64_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned long uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif
#else
/* not windows */
#include <sys/types.h>
#if defined __BEOS__ && !defined __HAIKU__
#include <ByteOrder.h>
#else
#include <arpa/inet.h>
#endif
#include <inttypes.h>
#endif
#if defined __hpux || defined __SUNPRO_C || defined _AIX
#define inline
#endif
#ifdef __linux
#define inline __inline
#endif
#endif