Mercurial > public > mercurial-scm > hg
diff mercurial/cext/pathencode.c @ 32372:df448de7cf3b
parsers: switch to policy importer
# no-check-commit
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sat, 13 Aug 2016 12:23:56 +0900 |
parents | mercurial/pathencode.c@f5607b6253da |
children | 44cb058bc0d3 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/cext/pathencode.c Sat Aug 13 12:23:56 2016 +0900 @@ -0,0 +1,765 @@ +/* + pathencode.c - efficient path name encoding + + Copyright 2012 Facebook + + This software may be used and distributed according to the terms of + the GNU General Public License, incorporated herein by reference. +*/ + +/* + * An implementation of the name encoding scheme used by the fncache + * store. The common case is of a path < 120 bytes long, which is + * handled either in a single pass with no allocations or two passes + * with a single allocation. For longer paths, multiple passes are + * required. + */ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <assert.h> +#include <ctype.h> +#include <stdlib.h> +#include <string.h> + +#include "util.h" + +/* state machine for the fast path */ +enum path_state { + START, /* first byte of a path component */ + A, /* "AUX" */ + AU, + THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */ + C, /* "CON" or "COMn" */ + CO, + COMLPT, /* "COM" or "LPT" */ + COMLPTn, + L, + LP, + N, + NU, + P, /* "PRN" */ + PR, + LDOT, /* leading '.' */ + DOT, /* '.' in a non-leading position */ + H, /* ".h" */ + HGDI, /* ".hg", ".d", or ".i" */ + SPACE, + DEFAULT /* byte of a path component after the first */ +}; + +/* state machine for dir-encoding */ +enum dir_state { + DDOT, + DH, + DHGDI, + DDEFAULT +}; + +static inline int inset(const uint32_t bitset[], char c) +{ + return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31)); +} + +static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize, + char c) +{ + if (dest) { + assert(*destlen < destsize); + dest[*destlen] = c; + } + (*destlen)++; +} + +static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize, + const void *src, Py_ssize_t len) +{ + if (dest) { + assert(*destlen + len < destsize); + memcpy((void *)&dest[*destlen], src, len); + } + *destlen += len; +} + +static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize, + uint8_t c) +{ + static const char hexdigit[] = "0123456789abcdef"; + + charcopy(dest, destlen, destsize, hexdigit[c >> 4]); + charcopy(dest, destlen, destsize, hexdigit[c & 15]); +} + +/* 3-byte escape: tilde followed by two hex digits */ +static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize, + char c) +{ + charcopy(dest, destlen, destsize, '~'); + hexencode(dest, destlen, destsize, c); +} + +static Py_ssize_t _encodedir(char *dest, size_t destsize, + const char *src, Py_ssize_t len) +{ + enum dir_state state = DDEFAULT; + Py_ssize_t i = 0, destlen = 0; + + while (i < len) { + switch (state) { + case DDOT: + switch (src[i]) { + case 'd': + case 'i': + state = DHGDI; + charcopy(dest, &destlen, destsize, src[i++]); + break; + case 'h': + state = DH; + charcopy(dest, &destlen, destsize, src[i++]); + break; + default: + state = DDEFAULT; + break; + } + break; + case DH: + if (src[i] == 'g') { + state = DHGDI; + charcopy(dest, &destlen, destsize, src[i++]); + } + else state = DDEFAULT; + break; + case DHGDI: + if (src[i] == '/') { + memcopy(dest, &destlen, destsize, ".hg", 3); + charcopy(dest, &destlen, destsize, src[i++]); + } + state = DDEFAULT; + break; + case DDEFAULT: + if (src[i] == '.') + state = DDOT; + charcopy(dest, &destlen, destsize, src[i++]); + break; + } + } + + return destlen; +} + +PyObject *encodedir(PyObject *self, PyObject *args) +{ + Py_ssize_t len, newlen; + PyObject *pathobj, *newobj; + char *path; + + if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj)) + return NULL; + + if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) { + PyErr_SetString(PyExc_TypeError, "expected a string"); + return NULL; + } + + newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1; + + if (newlen == len + 1) { + Py_INCREF(pathobj); + return pathobj; + } + + newobj = PyBytes_FromStringAndSize(NULL, newlen); + + if (newobj) { + assert(PyBytes_Check(newobj)); + Py_SIZE(newobj)--; + _encodedir(PyBytes_AS_STRING(newobj), newlen, path, + len + 1); + } + + return newobj; +} + +static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8], + char *dest, Py_ssize_t destlen, size_t destsize, + const char *src, Py_ssize_t len, + int encodedir) +{ + enum path_state state = START; + Py_ssize_t i = 0; + + /* + * Python strings end with a zero byte, which we use as a + * terminal token as they are not valid inside path names. + */ + + while (i < len) { + switch (state) { + case START: + switch (src[i]) { + case '/': + charcopy(dest, &destlen, destsize, src[i++]); + break; + case '.': + state = LDOT; + escape3(dest, &destlen, destsize, src[i++]); + break; + case ' ': + state = DEFAULT; + escape3(dest, &destlen, destsize, src[i++]); + break; + case 'a': + state = A; + charcopy(dest, &destlen, destsize, src[i++]); + break; + case 'c': + state = C; + charcopy(dest, &destlen, destsize, src[i++]); + break; + case 'l': + state = L; + charcopy(dest, &destlen, destsize, src[i++]); + break; + case 'n': + state = N; + charcopy(dest, &destlen, destsize, src[i++]); + break; + case 'p': + state = P; + charcopy(dest, &destlen, destsize, src[i++]); + break; + default: + state = DEFAULT; + break; + } + break; + case A: + if (src[i] == 'u') { + state = AU; + charcopy(dest, &destlen, destsize, src[i++]); + } + else state = DEFAULT; + break; + case AU: + if (src[i] == 'x') { + state = THIRD; + i++; + } + else state = DEFAULT; + break; + case THIRD: + state = DEFAULT; + switch (src[i]) { + case '.': + case '/': + case '\0': + escape3(dest, &destlen, destsize, src[i - 1]); + break; + default: + i--; + break; + } + break; + case C: + if (src[i] == 'o') { + state = CO; + charcopy(dest, &destlen, destsize, src[i++]); + } + else state = DEFAULT; + break; + case CO: + if (src[i] == 'm') { + state = COMLPT; + i++; + } + else if (src[i] == 'n') { + state = THIRD; + i++; + } + else state = DEFAULT; + break; + case COMLPT: + switch (src[i]) { + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + state = COMLPTn; + i++; + break; + default: + state = DEFAULT; + charcopy(dest, &destlen, destsize, src[i - 1]); + break; + } + break; + case COMLPTn: + state = DEFAULT; + switch (src[i]) { + case '.': + case '/': + case '\0': + escape3(dest, &destlen, destsize, src[i - 2]); + charcopy(dest, &destlen, destsize, src[i - 1]); + break; + default: + memcopy(dest, &destlen, destsize, + &src[i - 2], 2); + break; + } + break; + case L: + if (src[i] == 'p') { + state = LP; + charcopy(dest, &destlen, destsize, src[i++]); + } + else state = DEFAULT; + break; + case LP: + if (src[i] == 't') { + state = COMLPT; + i++; + } + else state = DEFAULT; + break; + case N: + if (src[i] == 'u') { + state = NU; + charcopy(dest, &destlen, destsize, src[i++]); + } + else state = DEFAULT; + break; + case NU: + if (src[i] == 'l') { + state = THIRD; + i++; + } + else state = DEFAULT; + break; + case P: + if (src[i] == 'r') { + state = PR; + charcopy(dest, &destlen, destsize, src[i++]); + } + else state = DEFAULT; + break; + case PR: + if (src[i] == 'n') { + state = THIRD; + i++; + } + else state = DEFAULT; + break; + case LDOT: + switch (src[i]) { + case 'd': + case 'i': + state = HGDI; + charcopy(dest, &destlen, destsize, src[i++]); + break; + case 'h': + state = H; + charcopy(dest, &destlen, destsize, src[i++]); + break; + default: + state = DEFAULT; + break; + } + break; + case DOT: + switch (src[i]) { + case '/': + case '\0': + state = START; + memcopy(dest, &destlen, destsize, "~2e", 3); + charcopy(dest, &destlen, destsize, src[i++]); + break; + case 'd': + case 'i': + state = HGDI; + charcopy(dest, &destlen, destsize, '.'); + charcopy(dest, &destlen, destsize, src[i++]); + break; + case 'h': + state = H; + memcopy(dest, &destlen, destsize, ".h", 2); + i++; + break; + default: + state = DEFAULT; + charcopy(dest, &destlen, destsize, '.'); + break; + } + break; + case H: + if (src[i] == 'g') { + state = HGDI; + charcopy(dest, &destlen, destsize, src[i++]); + } + else state = DEFAULT; + break; + case HGDI: + if (src[i] == '/') { + state = START; + if (encodedir) + memcopy(dest, &destlen, destsize, ".hg", + 3); + charcopy(dest, &destlen, destsize, src[i++]); + } + else state = DEFAULT; + break; + case SPACE: + switch (src[i]) { + case '/': + case '\0': + state = START; + memcopy(dest, &destlen, destsize, "~20", 3); + charcopy(dest, &destlen, destsize, src[i++]); + break; + default: + state = DEFAULT; + charcopy(dest, &destlen, destsize, ' '); + break; + } + break; + case DEFAULT: + while (inset(onebyte, src[i])) { + charcopy(dest, &destlen, destsize, src[i++]); + if (i == len) + goto done; + } + switch (src[i]) { + case '.': + state = DOT; + i++; + break; + case ' ': + state = SPACE; + i++; + break; + case '/': + state = START; + charcopy(dest, &destlen, destsize, '/'); + i++; + break; + default: + if (inset(onebyte, src[i])) { + do { + charcopy(dest, &destlen, + destsize, src[i++]); + } while (i < len && + inset(onebyte, src[i])); + } + else if (inset(twobytes, src[i])) { + char c = src[i++]; + charcopy(dest, &destlen, destsize, '_'); + charcopy(dest, &destlen, destsize, + c == '_' ? '_' : c + 32); + } + else + escape3(dest, &destlen, destsize, + src[i++]); + break; + } + break; + } + } +done: + return destlen; +} + +static Py_ssize_t basicencode(char *dest, size_t destsize, + const char *src, Py_ssize_t len) +{ + static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe }; + + static const uint32_t onebyte[8] = { + 1, 0x2bff3bfa, 0x68000001, 0x2fffffff, + }; + + Py_ssize_t destlen = 0; + + return _encode(twobytes, onebyte, dest, destlen, destsize, + src, len, 1); +} + +static const Py_ssize_t maxstorepathlen = 120; + +static Py_ssize_t _lowerencode(char *dest, size_t destsize, + const char *src, Py_ssize_t len) +{ + static const uint32_t onebyte[8] = { + 1, 0x2bfffbfb, 0xe8000001, 0x2fffffff + }; + + static const uint32_t lower[8] = { 0, 0, 0x7fffffe }; + + Py_ssize_t i, destlen = 0; + + for (i = 0; i < len; i++) { + if (inset(onebyte, src[i])) + charcopy(dest, &destlen, destsize, src[i]); + else if (inset(lower, src[i])) + charcopy(dest, &destlen, destsize, src[i] + 32); + else + escape3(dest, &destlen, destsize, src[i]); + } + + return destlen; +} + +PyObject *lowerencode(PyObject *self, PyObject *args) +{ + char *path; + Py_ssize_t len, newlen; + PyObject *ret; + + if (!PyArg_ParseTuple(args, "s#:lowerencode", &path, &len)) + return NULL; + + newlen = _lowerencode(NULL, 0, path, len); + ret = PyBytes_FromStringAndSize(NULL, newlen); + if (ret) + _lowerencode(PyBytes_AS_STRING(ret), newlen, path, len); + + return ret; +} + +/* See store.py:_auxencode for a description. */ +static Py_ssize_t auxencode(char *dest, size_t destsize, + const char *src, Py_ssize_t len) +{ + static const uint32_t twobytes[8]; + + static const uint32_t onebyte[8] = { + ~0U, 0xffff3ffe, ~0U, ~0U, ~0U, ~0U, ~0U, ~0U, + }; + + return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0); +} + +static PyObject *hashmangle(const char *src, Py_ssize_t len, const char sha[20]) +{ + static const Py_ssize_t dirprefixlen = 8; + static const Py_ssize_t maxshortdirslen = 68; + char *dest; + PyObject *ret; + + Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1; + Py_ssize_t destsize, destlen = 0, slop, used; + + while (lastslash >= 0 && src[lastslash] != '/') { + if (src[lastslash] == '.' && lastdot == -1) + lastdot = lastslash; + lastslash--; + } + +#if 0 + /* All paths should end in a suffix of ".i" or ".d". + Unfortunately, the file names in test-hybridencode.py + violate this rule. */ + if (lastdot != len - 3) { + PyErr_SetString(PyExc_ValueError, + "suffix missing or wrong length"); + return NULL; + } +#endif + + /* If src contains a suffix, we will append it to the end of + the new string, so make room. */ + destsize = 120; + if (lastdot >= 0) + destsize += len - lastdot - 1; + + ret = PyBytes_FromStringAndSize(NULL, destsize); + if (ret == NULL) + return NULL; + + dest = PyBytes_AS_STRING(ret); + memcopy(dest, &destlen, destsize, "dh/", 3); + + /* Copy up to dirprefixlen bytes of each path component, up to + a limit of maxshortdirslen bytes. */ + for (i = d = p = 0; i < lastslash; i++, p++) { + if (src[i] == '/') { + char d = dest[destlen - 1]; + /* After truncation, a directory name may end + in a space or dot, which are unportable. */ + if (d == '.' || d == ' ') + dest[destlen - 1] = '_'; + /* The + 3 is to account for "dh/" in the beginning */ + if (destlen > maxshortdirslen + 3) + break; + charcopy(dest, &destlen, destsize, src[i]); + p = -1; + } + else if (p < dirprefixlen) + charcopy(dest, &destlen, destsize, src[i]); + } + + /* Rewind to just before the last slash copied. */ + if (destlen > maxshortdirslen + 3) + do { + destlen--; + } while (destlen > 0 && dest[destlen] != '/'); + + if (destlen > 3) { + if (lastslash > 0) { + char d = dest[destlen - 1]; + /* The last directory component may be + truncated, so make it safe. */ + if (d == '.' || d == ' ') + dest[destlen - 1] = '_'; + } + + charcopy(dest, &destlen, destsize, '/'); + } + + /* Add a prefix of the original file's name. Its length + depends on the number of bytes left after accounting for + hash and suffix. */ + used = destlen + 40; + if (lastdot >= 0) + used += len - lastdot - 1; + slop = maxstorepathlen - used; + if (slop > 0) { + Py_ssize_t basenamelen = + lastslash >= 0 ? len - lastslash - 2 : len - 1; + + if (basenamelen > slop) + basenamelen = slop; + if (basenamelen > 0) + memcopy(dest, &destlen, destsize, &src[lastslash + 1], + basenamelen); + } + + /* Add hash and suffix. */ + for (i = 0; i < 20; i++) + hexencode(dest, &destlen, destsize, sha[i]); + + if (lastdot >= 0) + memcopy(dest, &destlen, destsize, &src[lastdot], + len - lastdot - 1); + + assert(PyBytes_Check(ret)); + Py_SIZE(ret) = destlen; + + return ret; +} + +/* + * Avoiding a trip through Python would improve performance by 50%, + * but we don't encounter enough long names to be worth the code. + */ +static int sha1hash(char hash[20], const char *str, Py_ssize_t len) +{ + static PyObject *shafunc; + PyObject *shaobj, *hashobj; + + if (shafunc == NULL) { + PyObject *hashlib, *name = PyBytes_FromString("hashlib"); + + if (name == NULL) + return -1; + + hashlib = PyImport_Import(name); + Py_DECREF(name); + + if (hashlib == NULL) { + PyErr_SetString(PyExc_ImportError, "hashlib"); + return -1; + } + shafunc = PyObject_GetAttrString(hashlib, "sha1"); + Py_DECREF(hashlib); + + if (shafunc == NULL) { + PyErr_SetString(PyExc_AttributeError, + "module 'hashlib' has no " + "attribute 'sha1'"); + return -1; + } + } + + shaobj = PyObject_CallFunction(shafunc, "s#", str, len); + + if (shaobj == NULL) + return -1; + + hashobj = PyObject_CallMethod(shaobj, "digest", ""); + Py_DECREF(shaobj); + if (hashobj == NULL) + return -1; + + if (!PyBytes_Check(hashobj) || PyBytes_GET_SIZE(hashobj) != 20) { + PyErr_SetString(PyExc_TypeError, + "result of digest is not a 20-byte hash"); + Py_DECREF(hashobj); + return -1; + } + + memcpy(hash, PyBytes_AS_STRING(hashobj), 20); + Py_DECREF(hashobj); + return 0; +} + +#define MAXENCODE 4096 * 4 + +static PyObject *hashencode(const char *src, Py_ssize_t len) +{ + char dired[MAXENCODE]; + char lowered[MAXENCODE]; + char auxed[MAXENCODE]; + Py_ssize_t dirlen, lowerlen, auxlen, baselen; + char sha[20]; + + baselen = (len - 5) * 3; + if (baselen >= MAXENCODE) { + PyErr_SetString(PyExc_ValueError, "string too long"); + return NULL; + } + + dirlen = _encodedir(dired, baselen, src, len); + if (sha1hash(sha, dired, dirlen - 1) == -1) + return NULL; + lowerlen = _lowerencode(lowered, baselen, dired + 5, dirlen - 5); + auxlen = auxencode(auxed, baselen, lowered, lowerlen); + return hashmangle(auxed, auxlen, sha); +} + +PyObject *pathencode(PyObject *self, PyObject *args) +{ + Py_ssize_t len, newlen; + PyObject *pathobj, *newobj; + char *path; + + if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj)) + return NULL; + + if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) { + PyErr_SetString(PyExc_TypeError, "expected a string"); + return NULL; + } + + if (len > maxstorepathlen) + newlen = maxstorepathlen + 2; + else + newlen = len ? basicencode(NULL, 0, path, len + 1) : 1; + + if (newlen <= maxstorepathlen + 1) { + if (newlen == len + 1) { + Py_INCREF(pathobj); + return pathobj; + } + + newobj = PyBytes_FromStringAndSize(NULL, newlen); + + if (newobj) { + assert(PyBytes_Check(newobj)); + Py_SIZE(newobj)--; + basicencode(PyBytes_AS_STRING(newobj), newlen, path, + len + 1); + } + } + else + newobj = hashencode(path, len + 1); + + return newobj; +}