diff options
Diffstat (limited to 'src/htsencoding.c')
-rw-r--r-- | src/htsencoding.c | 176 |
1 files changed, 176 insertions, 0 deletions
diff --git a/src/htsencoding.c b/src/htsencoding.c new file mode 100644 index 0000000..46c57a4 --- /dev/null +++ b/src/htsencoding.c @@ -0,0 +1,176 @@ +/* ------------------------------------------------------------ */ +/* +HTTrack Website Copier, Offline Browser for Windows and Unix +Copyright (C) Xavier Roche and other contributors + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 3 +of the License, or any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +Important notes: + +- We hereby ask people using this source NOT to use it in purpose of grabbing +emails addresses, or collecting any other private information on persons. +This would disgrace our work, and spoil the many hours we spent on it. + +Please visit our Website: http://www.httrack.com +*/ + +/* ------------------------------------------------------------ */ +/* File: Encoding conversion functions */ +/* Author: Xavier Roche */ +/* ------------------------------------------------------------ */ + +#include "htscharset.h" +#include "htsencoding.h" + +/* static int decode_entity(const unsigned int hash, const size_t len); +*/ +#include "htsentities.h" + +/* hexadecimal conversion */ +static int get_hex_value(char c) { + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'a' && c <= 'f') + return (c - 'a' + 10); + else if (c >= 'A' && c <= 'F') + return (c - 'A' + 10); + else + return -1; +} + +/* Numerical Recipes, + see <http://en.wikipedia.org/wiki/Linear_congruential_generator> */ +#define HASH_PRIME ( 1664525 ) +#define HASH_CONST ( 1013904223 ) +#define HASH_ADD(HASH, C) do { \ + (HASH) *= HASH_PRIME; \ + (HASH) += HASH_CONST; \ + (HASH) += (C); \ + } while(0) + +int hts_unescape_entities(const char *src, char *dest, const size_t max) { + size_t i, j, ampStart, ampStartDest; + int uc; + int hex; + unsigned int hash; + for(i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0, + uc = -1, hex = 0, hash = 0 ; src[i] != '\0' ; i++) { + /* start of entity */ + if (src[i] == '&') { + ampStart = i; + ampStartDest = j; + hash = 0; + uc = -1; + } + /* inside a potential entity */ + else if (ampStart != (size_t) -1) { + /* &#..; entity */ + if (ampStart + 1 == i && src[ampStart + 1] == '#') { + uc = 0; + hex = 0; + } + /* &#x..; entity */ + else if (ampStart + 2 == i && src[ampStart + 1] == '#' + && src[ampStart + 2] == 'x') { + hex = 1; + } + /* end of entity */ + else if (src[i] == ';') { + size_t len; + + /* decode entity */ + if (uc == -1) { + /* &foo; */ + uc = decode_entity(hash, /*&src[ampStart + 1],*/ + i - ampStart - 1); + /* FIXME: TEMPORARY HACK FROM PREVIOUS VERSION TO BE INVESTIGATED */ + if (uc == 160) { + uc = 32; + } + } + + /* end */ + ampStart = (size_t) -1; + + /* success ? */ + if (uc > 0) { + /* write at position */ + len = hts_writeUTF8(uc, &dest[ampStartDest], max - ampStartDest); + if (len > 0) { + /* new dest position */ + j = ampStartDest + len; + } + /* do not copy ; */ + continue; + } + } + /* numerical entity */ + else if (uc != -1) { + /* decimal */ + if (!hex) { + if (src[i] >= '0' && src[i] <= '9') { + const int h = src[i] - '0'; + uc *= 10; + uc += h; + } else { + /* abandon */ + ampStart = (size_t) -1; + } + } + /* hex */ + else { + const int h = get_hex_value(src[i]); + if (h != -1) { + uc *= 16; + uc += h; + } else { + /* abandon */ + ampStart = (size_t) -1; + } + } + } + /* alphanumerical entity */ + else { + /* alphanum and not too far ('ϑ' is the longest) */ + if (i <= ampStart + 10 && + ( + (src[i] >= '0' && src[i] <= '9') + || (src[i] >= 'A' && src[i] <= 'Z') + || (src[i] >= 'a' && src[i] <= 'z') + ) + ) { + /* compute hash */ + HASH_ADD(hash, (unsigned char) src[i]); + } else { + /* abandon */ + ampStart = (size_t) -1; + } + } + } + + /* copy */ + if (j + 1 > max) { + /* overflow */ + return -1; + } + if (src != dest || i != j) { + dest[j] = src[i]; + } + j++; + } + dest[j] = '\0'; + + return 0; +} |