diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2013-05-30 19:04:51 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2013-05-30 19:04:51 +0000 |
commit | 850f165f4ac90a6e6687c392ddfdd0c6a05b3fe5 (patch) | |
tree | 7cac2f01468639c4ab63fe523c17d7638e8cd2ac /src/htslib.c | |
parent | 01af2a5e73f53ebf8a092e4bda77cd1326c1da11 (diff) |
Added hts_unescape_entities(), a rewrite of the HTML entities decoder.
Fixed HTML entity decoding, which was incorrectly performed before charset decoding.
Diffstat (limited to 'src/htslib.c')
-rw-r--r-- | src/htslib.c | 256 |
1 files changed, 7 insertions, 249 deletions
diff --git a/src/htslib.c b/src/htslib.c index fe9f240..8b53b88 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -54,6 +54,7 @@ Please visit our Website: http://www.httrack.com #include "htsmd5.h" #include "htsmodules.h" #include "htscharset.h" +#include "htsencoding.h" #ifdef _WIN32 #ifndef _WIN32_WCE @@ -3737,255 +3738,6 @@ void code64(unsigned char *a, int size_a, unsigned char *b, int crlf) { *b++ = '\0'; } -// remplacer " par " etc.. -// buffer MAX 1Ko -#define strcmpbeg(a, b) strncmp(a, b, strlen(b)) -HTSEXT_API void unescape_amp(char *s) { - while(*s) { - if (*s == '&') { - char *end = strchr(s, ';'); - - if (end && (((int) (end - s)) <= 8)) { - unsigned char c = 0; - - // http://www.w3.org/TR/xhtml-modularization/dtd_module_defs.html - if (strcmpbeg(s, "&#") == 0) { - int num = 0; - - if ((s[2] == 'x') || (s[2] == 'X')) { - if (sscanf(s + 3, "%x", &num) == 1 && num <= 0xff) { - c = (unsigned char) num; - } - } else { - if (sscanf(s + 2, "%d", &num) == 1 && num <= 0xff) { - c = (unsigned char) num; - } - } - } else if (strcmpbeg(s, " ") == 0) - c = 32; // hack - c=160; - else if (strcmpbeg(s, "¡") == 0) - c = 161; - else if (strcmpbeg(s, "¢") == 0) - c = 162; - else if (strcmpbeg(s, "£") == 0) - c = 163; - else if (strcmpbeg(s, "¤") == 0) - c = 164; - else if (strcmpbeg(s, "¥") == 0) - c = 165; - else if (strcmpbeg(s, "¦") == 0) - c = 166; - else if (strcmpbeg(s, "§") == 0) - c = 167; - else if (strcmpbeg(s, "¨") == 0) - c = 168; - else if (strcmpbeg(s, "©") == 0) - c = 169; - else if (strcmpbeg(s, "ª") == 0) - c = 170; - //else if (strcmpbeg(s, "«")==0) - // c=171; - else if (strcmpbeg(s, "¬") == 0) - c = 172; - //else if (strcmpbeg(s, "­")==0) - // c=173; - else if (strcmpbeg(s, "®") == 0) - c = 174; - else if (strcmpbeg(s, "¯") == 0) - c = 175; - else if (strcmpbeg(s, "°") == 0) - c = 176; - else if (strcmpbeg(s, "±") == 0) - c = 177; - else if (strcmpbeg(s, "²") == 0) - c = 178; - else if (strcmpbeg(s, "³") == 0) - c = 179; - else if (strcmpbeg(s, "´") == 
0) - c = 180; - else if (strcmpbeg(s, "µ") == 0) - c = 181; - else if (strcmpbeg(s, "¶") == 0) - c = 182; - else if (strcmpbeg(s, "·") == 0) - c = 183; - else if (strcmpbeg(s, "¸") == 0) - c = 184; - else if (strcmpbeg(s, "¹") == 0) - c = 185; - else if (strcmpbeg(s, "º") == 0) - c = 186; - //else if (strcmpbeg(s, "»")==0) - // c=187; - else if (strcmpbeg(s, "¼") == 0) - c = 188; - else if (strcmpbeg(s, "½") == 0) - c = 189; - else if (strcmpbeg(s, "¾") == 0) - c = 190; - else if (strcmpbeg(s, "¿") == 0) - c = 191; - else if (strcmpbeg(s, "À") == 0) - c = 192; - else if (strcmpbeg(s, "Á") == 0) - c = 193; - else if (strcmpbeg(s, "Â") == 0) - c = 194; - else if (strcmpbeg(s, "Ã") == 0) - c = 195; - else if (strcmpbeg(s, "Ä") == 0) - c = 196; - else if (strcmpbeg(s, "Å") == 0) - c = 197; - else if (strcmpbeg(s, "Æ") == 0) - c = 198; - else if (strcmpbeg(s, "Ç") == 0) - c = 199; - else if (strcmpbeg(s, "È") == 0) - c = 200; - else if (strcmpbeg(s, "É") == 0) - c = 201; - else if (strcmpbeg(s, "Ê") == 0) - c = 202; - else if (strcmpbeg(s, "Ë") == 0) - c = 203; - else if (strcmpbeg(s, "Ì") == 0) - c = 204; - else if (strcmpbeg(s, "Í") == 0) - c = 205; - else if (strcmpbeg(s, "Î") == 0) - c = 206; - else if (strcmpbeg(s, "Ï") == 0) - c = 207; - else if (strcmpbeg(s, "Ð") == 0) - c = 208; - else if (strcmpbeg(s, "Ñ") == 0) - c = 209; - else if (strcmpbeg(s, "Ò") == 0) - c = 210; - else if (strcmpbeg(s, "Ó") == 0) - c = 211; - else if (strcmpbeg(s, "Ô") == 0) - c = 212; - else if (strcmpbeg(s, "Õ") == 0) - c = 213; - else if (strcmpbeg(s, "Ö") == 0) - c = 214; - else if (strcmpbeg(s, "×") == 0) - c = 215; - else if (strcmpbeg(s, "Ø") == 0) - c = 216; - else if (strcmpbeg(s, "Ù") == 0) - c = 217; - else if (strcmpbeg(s, "Ú") == 0) - c = 218; - else if (strcmpbeg(s, "Û") == 0) - c = 219; - else if (strcmpbeg(s, "Ü") == 0) - c = 220; - else if (strcmpbeg(s, "Ý") == 0) - c = 221; - else if (strcmpbeg(s, "Þ") == 0) - c = 222; - else if (strcmpbeg(s, "ß") == 0) - c = 223; - else 
if (strcmpbeg(s, "à") == 0) - c = 224; - else if (strcmpbeg(s, "á") == 0) - c = 225; - else if (strcmpbeg(s, "â") == 0) - c = 226; - else if (strcmpbeg(s, "ã") == 0) - c = 227; - else if (strcmpbeg(s, "ä") == 0) - c = 228; - else if (strcmpbeg(s, "å") == 0) - c = 229; - else if (strcmpbeg(s, "æ") == 0) - c = 230; - else if (strcmpbeg(s, "ç") == 0) - c = 231; - else if (strcmpbeg(s, "è") == 0) - c = 232; - else if (strcmpbeg(s, "é") == 0) - c = 233; - else if (strcmpbeg(s, "ê") == 0) - c = 234; - else if (strcmpbeg(s, "ë") == 0) - c = 235; - else if (strcmpbeg(s, "ì") == 0) - c = 236; - else if (strcmpbeg(s, "í") == 0) - c = 237; - else if (strcmpbeg(s, "î") == 0) - c = 238; - else if (strcmpbeg(s, "ï") == 0) - c = 239; - else if (strcmpbeg(s, "ð") == 0) - c = 240; - else if (strcmpbeg(s, "ñ") == 0) - c = 241; - else if (strcmpbeg(s, "ò") == 0) - c = 242; - else if (strcmpbeg(s, "ó") == 0) - c = 243; - else if (strcmpbeg(s, "ô") == 0) - c = 244; - else if (strcmpbeg(s, "õ") == 0) - c = 245; - else if (strcmpbeg(s, "ö") == 0) - c = 246; - else if (strcmpbeg(s, "÷") == 0) - c = 247; - else if (strcmpbeg(s, "ø") == 0) - c = 248; - else if (strcmpbeg(s, "ù") == 0) - c = 249; - else if (strcmpbeg(s, "ú") == 0) - c = 250; - else if (strcmpbeg(s, "û") == 0) - c = 251; - else if (strcmpbeg(s, "ü") == 0) - c = 252; - else if (strcmpbeg(s, "ý") == 0) - c = 253; - else if (strcmpbeg(s, "þ") == 0) - c = 254; - else if (strcmpbeg(s, "ÿ") == 0) - c = 255; - // - else if (strcmpbeg(s, "&") == 0) - c = '&'; - else if (strcmpbeg(s, ">") == 0) - c = '>'; - else if (strcmpbeg(s, "«") == 0) - c = '\"'; - else if (strcmpbeg(s, "<") == 0) - c = '<'; - else if (strcmpbeg(s, " ") == 0) - c = ' '; - else if (strcmpbeg(s, """) == 0) - c = '\"'; - else if (strcmpbeg(s, "»") == 0) - c = '\"'; - else if (strcmpbeg(s, "­") == 0) - c = '-'; - else if (strcmpbeg(s, "˜") == 0) - c = '~'; - // remplacer? 
- if (c) { - char BIGSTK buff[HTS_URLMAXSIZE * 2]; - - buff[0] = (char) c; - strcpybuff(buff + 1, end + 1); - strcpybuff(s, buff); - } - } - } - s++; - } -} - static int ehexh(char c) { if ((c >= '0') && (c <= '9')) return c - '0'; @@ -4000,6 +3752,12 @@ static int ehex(const char *s) { return 16 * ehexh(*s) + ehexh(*(s + 1)); } +void unescape_amp(char *s) { + if (hts_unescape_entities(s, s, strlen(s) + 1) != 0) { + assertf(! "error escaping html entities"); + } +} + // remplacer %20 par ' ', | par : etc.. // buffer MAX 1Ko HTSEXT_API char *unescape_http(char *catbuff, const char *s) { |