diff -r 151cc3b3d799 -r df448de7cf3b mercurial/pathencode.c --- a/mercurial/pathencode.c Sat Aug 13 12:18:58 2016 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,765 +0,0 @@ -/* - pathencode.c - efficient path name encoding - - Copyright 2012 Facebook - - This software may be used and distributed according to the terms of - the GNU General Public License, incorporated herein by reference. -*/ - -/* - * An implementation of the name encoding scheme used by the fncache - * store. The common case is of a path < 120 bytes long, which is - * handled either in a single pass with no allocations or two passes - * with a single allocation. For longer paths, multiple passes are - * required. - */ - -#define PY_SSIZE_T_CLEAN -#include -#include -#include -#include -#include - -#include "util.h" - -/* state machine for the fast path */ -enum path_state { - START, /* first byte of a path component */ - A, /* "AUX" */ - AU, - THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */ - C, /* "CON" or "COMn" */ - CO, - COMLPT, /* "COM" or "LPT" */ - COMLPTn, - L, - LP, - N, - NU, - P, /* "PRN" */ - PR, - LDOT, /* leading '.' */ - DOT, /* '.' in a non-leading position */ - H, /* ".h" */ - HGDI, /* ".hg", ".d", or ".i" */ - SPACE, - DEFAULT /* byte of a path component after the first */ -}; - -/* state machine for dir-encoding */ -enum dir_state { - DDOT, - DH, - DHGDI, - DDEFAULT -}; - -static inline int inset(const uint32_t bitset[], char c) -{ - return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31)); -} - -static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize, - char c) -{ - if (dest) { - assert(*destlen < destsize); - dest[*destlen] = c; - } - (*destlen)++; -} - -static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize, - const void *src, Py_ssize_t len) -{ - if (dest) { - assert(*destlen + len < destsize); - memcpy((void *)&dest[*destlen], src, len); - } - *destlen += len; -} - -static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize, - uint8_t c) -{ - static const char hexdigit[] = "0123456789abcdef"; - - charcopy(dest, destlen, destsize, hexdigit[c >> 4]); - charcopy(dest, destlen, destsize, hexdigit[c & 15]); -} - -/* 3-byte escape: tilde followed by two hex digits */ -static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize, - char c) -{ - charcopy(dest, destlen, destsize, '~'); - hexencode(dest, destlen, destsize, c); -} - -static Py_ssize_t _encodedir(char *dest, size_t destsize, - const char *src, Py_ssize_t len) -{ - enum dir_state state = DDEFAULT; - Py_ssize_t i = 0, destlen = 0; - - while (i < len) { - switch (state) { - case DDOT: - switch (src[i]) { - case 'd': - case 'i': - state = DHGDI; - charcopy(dest, &destlen, destsize, src[i++]); - break; - case 'h': - state = DH; - charcopy(dest, &destlen, destsize, src[i++]); - break; - default: - state = DDEFAULT; - break; - } - break; - case DH: - if (src[i] == 'g') { - state = DHGDI; - charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DDEFAULT; - break; - case DHGDI: - if (src[i] == '/') { - memcopy(dest, &destlen, destsize, ".hg", 3); - charcopy(dest, &destlen, destsize, src[i++]); - } - state = DDEFAULT; - break; - case DDEFAULT: - if (src[i] == '.') - state = DDOT; - charcopy(dest, &destlen, destsize, src[i++]); - break; - } - } - - return destlen; -} - -PyObject *encodedir(PyObject *self, PyObject *args) -{ - Py_ssize_t len, newlen; - PyObject *pathobj, *newobj; - char *path; - - if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj)) - return NULL; - - if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) { - PyErr_SetString(PyExc_TypeError, "expected a string"); - return NULL; - } - - newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1; - - if (newlen == len + 1) { - Py_INCREF(pathobj); - return pathobj; - } - - newobj = PyBytes_FromStringAndSize(NULL, newlen); - - if (newobj) { - assert(PyBytes_Check(newobj)); - Py_SIZE(newobj)--; - _encodedir(PyBytes_AS_STRING(newobj), newlen, path, - len + 1); - } - - return newobj; -} - -static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8], - char *dest, Py_ssize_t destlen, size_t destsize, - const char *src, Py_ssize_t len, - int encodedir) -{ - enum path_state state = START; - Py_ssize_t i = 0; - - /* - * Python strings end with a zero byte, which we use as a - * terminal token as they are not valid inside path names. - */ - - while (i < len) { - switch (state) { - case START: - switch (src[i]) { - case '/': - charcopy(dest, &destlen, destsize, src[i++]); - break; - case '.': - state = LDOT; - escape3(dest, &destlen, destsize, src[i++]); - break; - case ' ': - state = DEFAULT; - escape3(dest, &destlen, destsize, src[i++]); - break; - case 'a': - state = A; - charcopy(dest, &destlen, destsize, src[i++]); - break; - case 'c': - state = C; - charcopy(dest, &destlen, destsize, src[i++]); - break; - case 'l': - state = L; - charcopy(dest, &destlen, destsize, src[i++]); - break; - case 'n': - state = N; - charcopy(dest, &destlen, destsize, src[i++]); - break; - case 'p': - state = P; - charcopy(dest, &destlen, destsize, src[i++]); - break; - default: - state = DEFAULT; - break; - } - break; - case A: - if (src[i] == 'u') { - state = AU; - charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; - break; - case AU: - if (src[i] == 'x') { - state = THIRD; - i++; - } - else state = DEFAULT; - break; - case THIRD: - state = DEFAULT; - switch (src[i]) { - case '.': - case '/': - case '\0': - escape3(dest, &destlen, destsize, src[i - 1]); - break; - default: - i--; - break; - } - break; - case C: - if (src[i] == 'o') { - state = CO; - charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; - break; - case CO: - if (src[i] == 'm') { - state = COMLPT; - i++; - } - else if (src[i] == 'n') { - state = THIRD; - i++; - } - else state = DEFAULT; - break; - case COMLPT: - switch (src[i]) { - case '1': case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': - state = COMLPTn; - i++; - break; - default: - state = DEFAULT; - charcopy(dest, &destlen, destsize, src[i - 1]); - break; - } - break; - case COMLPTn: - state = DEFAULT; - switch (src[i]) { - case '.': - case '/': - case '\0': - escape3(dest, &destlen, destsize, src[i - 2]); - charcopy(dest, &destlen, destsize, src[i - 1]); - break; - default: - memcopy(dest, &destlen, destsize, - &src[i - 2], 2); - break; - } - break; - case L: - if (src[i] == 'p') { - state = LP; - charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; - break; - case LP: - if (src[i] == 't') { - state = COMLPT; - i++; - } - else state = DEFAULT; - break; - case N: - if (src[i] == 'u') { - state = NU; - charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; - break; - case NU: - if (src[i] == 'l') { - state = THIRD; - i++; - } - else state = DEFAULT; - break; - case P: - if (src[i] == 'r') { - state = PR; - charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; - break; - case PR: - if (src[i] == 'n') { - state = THIRD; - i++; - } - else state = DEFAULT; - break; - case LDOT: - switch (src[i]) { - case 'd': - case 'i': - state = HGDI; - charcopy(dest, &destlen, destsize, src[i++]); - break; - case 'h': - state = H; - charcopy(dest, &destlen, destsize, src[i++]); - break; - default: - state = DEFAULT; - break; - } - break; - case DOT: - switch (src[i]) { - case '/': - case '\0': - state = START; - memcopy(dest, &destlen, destsize, "~2e", 3); - charcopy(dest, &destlen, destsize, src[i++]); - break; - case 'd': - case 'i': - state = HGDI; - charcopy(dest, &destlen, destsize, '.'); - charcopy(dest, &destlen, destsize, src[i++]); - break; - case 'h': - state = H; - memcopy(dest, &destlen, destsize, ".h", 2); - i++; - break; - default: - state = DEFAULT; - charcopy(dest, &destlen, destsize, '.'); - break; - } - break; - case H: - if (src[i] == 'g') { - state = HGDI; - charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; - break; - case HGDI: - if (src[i] == '/') { - state = START; - if (encodedir) - memcopy(dest, &destlen, destsize, ".hg", - 3); - charcopy(dest, &destlen, destsize, src[i++]); - } - else state = DEFAULT; - break; - case SPACE: - switch (src[i]) { - case '/': - case '\0': - state = START; - memcopy(dest, &destlen, destsize, "~20", 3); - charcopy(dest, &destlen, destsize, src[i++]); - break; - default: - state = DEFAULT; - charcopy(dest, &destlen, destsize, ' '); - break; - } - break; - case DEFAULT: - while (inset(onebyte, src[i])) { - charcopy(dest, &destlen, destsize, src[i++]); - if (i == len) - goto done; - } - switch (src[i]) { - case '.': - state = DOT; - i++; - break; - case ' ': - state = SPACE; - i++; - break; - case '/': - state = START; - charcopy(dest, &destlen, destsize, '/'); - i++; - break; - default: - if (inset(onebyte, src[i])) { - do { - charcopy(dest, &destlen, - destsize, src[i++]); - } while (i < len && - inset(onebyte, src[i])); - } - else if (inset(twobytes, src[i])) { - char c = src[i++]; - charcopy(dest, &destlen, destsize, '_'); - charcopy(dest, &destlen, destsize, - c == '_' ? '_' : c + 32); - } - else - escape3(dest, &destlen, destsize, - src[i++]); - break; - } - break; - } - } -done: - return destlen; -} - -static Py_ssize_t basicencode(char *dest, size_t destsize, - const char *src, Py_ssize_t len) -{ - static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe }; - - static const uint32_t onebyte[8] = { - 1, 0x2bff3bfa, 0x68000001, 0x2fffffff, - }; - - Py_ssize_t destlen = 0; - - return _encode(twobytes, onebyte, dest, destlen, destsize, - src, len, 1); -} - -static const Py_ssize_t maxstorepathlen = 120; - -static Py_ssize_t _lowerencode(char *dest, size_t destsize, - const char *src, Py_ssize_t len) -{ - static const uint32_t onebyte[8] = { - 1, 0x2bfffbfb, 0xe8000001, 0x2fffffff - }; - - static const uint32_t lower[8] = { 0, 0, 0x7fffffe }; - - Py_ssize_t i, destlen = 0; - - for (i = 0; i < len; i++) { - if (inset(onebyte, src[i])) - charcopy(dest, &destlen, destsize, src[i]); - else if (inset(lower, src[i])) - charcopy(dest, &destlen, destsize, src[i] + 32); - else - escape3(dest, &destlen, destsize, src[i]); - } - - return destlen; -} - -PyObject *lowerencode(PyObject *self, PyObject *args) -{ - char *path; - Py_ssize_t len, newlen; - PyObject *ret; - - if (!PyArg_ParseTuple(args, "s#:lowerencode", &path, &len)) - return NULL; - - newlen = _lowerencode(NULL, 0, path, len); - ret = PyBytes_FromStringAndSize(NULL, newlen); - if (ret) - _lowerencode(PyBytes_AS_STRING(ret), newlen, path, len); - - return ret; -} - -/* See store.py:_auxencode for a description. */ -static Py_ssize_t auxencode(char *dest, size_t destsize, - const char *src, Py_ssize_t len) -{ - static const uint32_t twobytes[8]; - - static const uint32_t onebyte[8] = { - ~0U, 0xffff3ffe, ~0U, ~0U, ~0U, ~0U, ~0U, ~0U, - }; - - return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0); -} - -static PyObject *hashmangle(const char *src, Py_ssize_t len, const char sha[20]) -{ - static const Py_ssize_t dirprefixlen = 8; - static const Py_ssize_t maxshortdirslen = 68; - char *dest; - PyObject *ret; - - Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1; - Py_ssize_t destsize, destlen = 0, slop, used; - - while (lastslash >= 0 && src[lastslash] != '/') { - if (src[lastslash] == '.' && lastdot == -1) - lastdot = lastslash; - lastslash--; - } - -#if 0 - /* All paths should end in a suffix of ".i" or ".d". - Unfortunately, the file names in test-hybridencode.py - violate this rule. */ - if (lastdot != len - 3) { - PyErr_SetString(PyExc_ValueError, - "suffix missing or wrong length"); - return NULL; - } -#endif - - /* If src contains a suffix, we will append it to the end of - the new string, so make room. */ - destsize = 120; - if (lastdot >= 0) - destsize += len - lastdot - 1; - - ret = PyBytes_FromStringAndSize(NULL, destsize); - if (ret == NULL) - return NULL; - - dest = PyBytes_AS_STRING(ret); - memcopy(dest, &destlen, destsize, "dh/", 3); - - /* Copy up to dirprefixlen bytes of each path component, up to - a limit of maxshortdirslen bytes. */ - for (i = d = p = 0; i < lastslash; i++, p++) { - if (src[i] == '/') { - char d = dest[destlen - 1]; - /* After truncation, a directory name may end - in a space or dot, which are unportable. */ - if (d == '.' || d == ' ') - dest[destlen - 1] = '_'; - /* The + 3 is to account for "dh/" in the beginning */ - if (destlen > maxshortdirslen + 3) - break; - charcopy(dest, &destlen, destsize, src[i]); - p = -1; - } - else if (p < dirprefixlen) - charcopy(dest, &destlen, destsize, src[i]); - } - - /* Rewind to just before the last slash copied. */ - if (destlen > maxshortdirslen + 3) - do { - destlen--; - } while (destlen > 0 && dest[destlen] != '/'); - - if (destlen > 3) { - if (lastslash > 0) { - char d = dest[destlen - 1]; - /* The last directory component may be - truncated, so make it safe. */ - if (d == '.' || d == ' ') - dest[destlen - 1] = '_'; - } - - charcopy(dest, &destlen, destsize, '/'); - } - - /* Add a prefix of the original file's name. Its length - depends on the number of bytes left after accounting for - hash and suffix. */ - used = destlen + 40; - if (lastdot >= 0) - used += len - lastdot - 1; - slop = maxstorepathlen - used; - if (slop > 0) { - Py_ssize_t basenamelen = - lastslash >= 0 ? len - lastslash - 2 : len - 1; - - if (basenamelen > slop) - basenamelen = slop; - if (basenamelen > 0) - memcopy(dest, &destlen, destsize, &src[lastslash + 1], - basenamelen); - } - - /* Add hash and suffix. */ - for (i = 0; i < 20; i++) - hexencode(dest, &destlen, destsize, sha[i]); - - if (lastdot >= 0) - memcopy(dest, &destlen, destsize, &src[lastdot], - len - lastdot - 1); - - assert(PyBytes_Check(ret)); - Py_SIZE(ret) = destlen; - - return ret; -} - -/* - * Avoiding a trip through Python would improve performance by 50%, - * but we don't encounter enough long names to be worth the code. - */ -static int sha1hash(char hash[20], const char *str, Py_ssize_t len) -{ - static PyObject *shafunc; - PyObject *shaobj, *hashobj; - - if (shafunc == NULL) { - PyObject *hashlib, *name = PyBytes_FromString("hashlib"); - - if (name == NULL) - return -1; - - hashlib = PyImport_Import(name); - Py_DECREF(name); - - if (hashlib == NULL) { - PyErr_SetString(PyExc_ImportError, "hashlib"); - return -1; - } - shafunc = PyObject_GetAttrString(hashlib, "sha1"); - Py_DECREF(hashlib); - - if (shafunc == NULL) { - PyErr_SetString(PyExc_AttributeError, - "module 'hashlib' has no " - "attribute 'sha1'"); - return -1; - } - } - - shaobj = PyObject_CallFunction(shafunc, "s#", str, len); - - if (shaobj == NULL) - return -1; - - hashobj = PyObject_CallMethod(shaobj, "digest", ""); - Py_DECREF(shaobj); - if (hashobj == NULL) - return -1; - - if (!PyBytes_Check(hashobj) || PyBytes_GET_SIZE(hashobj) != 20) { - PyErr_SetString(PyExc_TypeError, - "result of digest is not a 20-byte hash"); - Py_DECREF(hashobj); - return -1; - } - - memcpy(hash, PyBytes_AS_STRING(hashobj), 20); - Py_DECREF(hashobj); - return 0; -} - -#define MAXENCODE 4096 * 4 - -static PyObject *hashencode(const char *src, Py_ssize_t len) -{ - char dired[MAXENCODE]; - char lowered[MAXENCODE]; - char auxed[MAXENCODE]; - Py_ssize_t dirlen, lowerlen, auxlen, baselen; - char sha[20]; - - baselen = (len - 5) * 3; - if (baselen >= MAXENCODE) { - PyErr_SetString(PyExc_ValueError, "string too long"); - return NULL; - } - - dirlen = _encodedir(dired, baselen, src, len); - if (sha1hash(sha, dired, dirlen - 1) == -1) - return NULL; - lowerlen = _lowerencode(lowered, baselen, dired + 5, dirlen - 5); - auxlen = auxencode(auxed, baselen, lowered, lowerlen); - return hashmangle(auxed, auxlen, sha); -} - -PyObject *pathencode(PyObject *self, PyObject *args) -{ - Py_ssize_t len, newlen; - PyObject *pathobj, *newobj; - char *path; - - if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj)) - return NULL; - - if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) { - PyErr_SetString(PyExc_TypeError, "expected a string"); - return NULL; - } - - if (len > maxstorepathlen) - newlen = maxstorepathlen + 2; - else - newlen = len ? basicencode(NULL, 0, path, len + 1) : 1; - - if (newlen <= maxstorepathlen + 1) { - if (newlen == len + 1) { - Py_INCREF(pathobj); - return pathobj; - } - - newobj = PyBytes_FromStringAndSize(NULL, newlen); - - if (newobj) { - assert(PyBytes_Check(newobj)); - Py_SIZE(newobj)--; - basicencode(PyBytes_AS_STRING(newobj), newlen, path, - len + 1); - } - } - else - newobj = hashencode(path, len + 1); - - return newobj; -}