/* ------------------------------------------------------------ */
/*
HTTrack Website Copier, Offline Browser for Windows and Unix
Copyright (C) 1998-2017 Xavier Roche and other contributors
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Important notes:
- We hereby ask people using this source NOT to use it in purpose of grabbing
emails addresses, or collecting any other private information on persons.
This would disgrace our work, and spoil the many hours we spent on it.
Please visit our Website: http://www.httrack.com
*/
/* ------------------------------------------------------------ */
/* File: htsparse.c parser */
/* html/javascript/css parser */
/* and other parser routines */
/* Author: Xavier Roche */
/* ------------------------------------------------------------ */
/* Internal engine bytecode */
#define HTS_INTERNAL_BYTECODE
#include
#include
/* File defs */
#include "htscore.h"
/* specific definitions */
#include "htsbase.h"
#include "htsnet.h"
#include "htsbauth.h"
#include "htsmd5.h"
#include "htsindex.h"
#include "htscharset.h"
#include "htsencoding.h"
/* external modules */
#include "htsmodules.h"
// htswrap_add
#include "htswrap.h"
// parser
#include "htsparse.h"
#include "htsback.h"
// arrays
#include "htsarrays.h"
/**
 * Flush pending input to the output buffer: the bytes between 'lastsaved'
 * and the current read pointer 'html' are appended, then 'lastsaved' is
 * moved up to 'html'. Nothing happens unless bit 0 of opt->getmode is set
 * and ptr > 0, or when there are no pending bytes.
 **/
#define HT_add_adr do { \
  if ((opt->getmode & 1) != 0 && ptr > 0 && html != lastsaved) { \
    const size_t pending_ = html - lastsaved; \
    TypedArrayAppend(output_buffer, lastsaved, pending_); \
    lastsaved = html; \
  } \
} while(0)
/**
 * Append the NUL-terminated string 'A' to the output buffer, as is (no
 * escaping, and no check of opt->getmode or ptr).
 * 'A' is evaluated exactly once, so an argument with side effects is safe.
 * Use as a statement: HT_ADD(str);
 **/
#define HT_ADD(A) do { \
  const char *const ht_add_str_ = (A); \
  TypedArrayAppend(output_buffer, ht_add_str_, strlen(ht_add_str_)); \
} while(0)
/**
 * Append to the output buffer the string 'A' after running it through the
 * escaping routine FUNCTION(source, destination, destination_room); the
 * value FUNCTION returns is added to the buffer size.
 * Skipped unless bit 0 of opt->getmode is set and ptr > 0.
 **/
#define HT_ADD_HTMLESCAPED_ANY(A, FUNCTION) do { \
  if ((opt->getmode & 1) != 0 && ptr > 0) { \
    const char *const source_ = (A); \
    size_t produced_; \
    /* reserve five output bytes per input byte (the size of "&amp;"), */ \
    /* plus 1024 bytes of slack */ \
    TypedArrayEnsureRoom(output_buffer, strlen(source_) * 5 + 1024); \
    produced_ = FUNCTION(source_, &TypedArrayTail(output_buffer), \
                         TypedArrayRoom(output_buffer)); \
    TypedArraySize(output_buffer) += produced_; \
  } \
} while(0)
/**
 * Append to the output buffer the string 'A', html-escaped for '&'.
 * Shorthand for HT_ADD_HTMLESCAPED_ANY with escape_for_html_print as the
 * escaping routine.
 **/
#define HT_ADD_HTMLESCAPED(A) HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print)
/**
 * Append to the output buffer the string 'A', html-escaped for '&' and
 * high chars.
 * Shorthand for HT_ADD_HTMLESCAPED_ANY with escape_for_html_print_full as
 * the escaping routine.
 **/
#define HT_ADD_HTMLESCAPED_FULL(A) HT_ADD_HTMLESCAPED_ANY(A, escape_for_html_print_full)
// No-op: expands to an empty do/while(0) statement, so it can be written
// wherever a statement is expected.
#define XH_uninit do {} while(0)
/**
 * Flush the accumulated output buffer to the local file savename(), then
 * release the buffer.
 *
 * When the buffer is not empty:
 * - a digest of the buffer is computed with domd5mem();
 * - if the file already on disk has the same size as the buffer, the digest
 *   stored in the cache under "//[HTML-MD5]//" is read back; if it is 32
 *   characters long and matches, the file is not re-written;
 * - otherwise the file is (re)created and written, its time is set from
 *   r->lastmodified when that is not empty, and write/creation failures are
 *   logged; a failure flagged by check_fatal_io_errno() also sets
 *   opt->state.exit_xh to -1;
 * - the digest is stored in the cache when cache->ndx is set.
 *
 * Expects 'fp', 'r', 'cache', 'opt' and 'output_buffer' in the calling scope.
 * The expansion is a bare { ... } block, not a do/while(0).
 *
 * NOTE(review): 'mbuff' is filled in by cache_readdata() and never released
 * here; whether the caller owns that buffer should be confirmed against
 * cache_readdata()'s contract.
 **/
#define HT_ADD_END { \
  int ok=0;\
  if (TypedArraySize(output_buffer) != 0) { \
    const size_t ht_len = TypedArraySize(output_buffer); \
    const char *const ht_buff = TypedArrayElts(output_buffer); \
    char digest[32+2];\
    off_t fsize_old = fsize(fconv(OPT_GET_BUFF(opt),OPT_GET_BUFF_SIZE(opt),savename()));\
    digest[0] = '\0';\
    domd5mem(TypedArrayElts(output_buffer), ht_len, digest, 1);\
    /* same size as the file on disk: compare with the cached digest */ \
    if (fsize_old == (off_t) ht_len) { \
      int mlen = 0;\
      /* start from NULL so a lookup that stores nothing is detectable */ \
      char* mbuff = NULL;\
      cache_readdata(cache,"//[HTML-MD5]//",savename(),&mbuff,&mlen);\
      /* terminate only a buffer that actually exists */ \
      if (mbuff != NULL && mlen > 0) \
        mbuff[mlen]='\0';\
      if ((mlen == 32) && (strcmp(((mbuff!=NULL)?mbuff:""),digest)==0)) {\
        ok=1;\
        hts_log_print(opt, LOG_DEBUG, "File not re-written (md5): %s",savename());\
      } else {\
        ok=0;\
      } \
    }\
    if (!ok) { \
      /* content changed (or no usable digest): write the file */ \
      file_notify(opt,urladr(), urlfil(), savename(), 1, 1, r->notmodified); \
      fp=filecreate(&opt->state.strc, savename()); \
      if (fp) { \
        if (ht_len>0) {\
          if (fwrite(ht_buff,1,ht_len,fp) != ht_len) { \
            int fcheck;\
            if ((fcheck=check_fatal_io_errno())) {\
              opt->state.exit_xh=-1;\
            }\
            if (opt->log) { \
              hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to write HTML file %s", savename());\
              if (fcheck) {\
                hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");\
              }\
            }\
          }\
        }\
        fclose(fp); fp=NULL; \
        if (strnotempty(r->lastmodified)) \
          set_filetime_rfc822(savename(),r->lastmodified); \
      } else {\
        /* the file could not be created */ \
        int fcheck;\
        if ((fcheck=check_fatal_io_errno())) {\
          hts_log_print(opt, LOG_ERROR, "Mirror aborted: disk full or filesystem problems"); \
          opt->state.exit_xh=-1;\
        }\
        hts_log_print(opt, LOG_ERROR | LOG_ERRNO, "Unable to save file %s", savename());\
        if (fcheck) {\
          hts_log_print(opt, LOG_ERROR, "* * Fatal write error, giving up");\
        }\
      }\
    } else {\
      /* identical content: keep the existing file, only report it */ \
      file_notify(opt,urladr(), urlfil(), savename(), 0, 0, r->notmodified); \
      filenote(&opt->state.strc, savename(),NULL); \
    }\
    /* remember the digest for the next comparison */ \
    if (cache->ndx)\
      cache_writedata(cache->ndx,cache->dat,"//[HTML-MD5]//",savename(),digest,(int)strlen(digest));\
  } \
  TypedArrayFree(output_buffer); \
}
/* Expands to nothing: placeholder kept for its call site in htsparse(). */
#define HT_ADD_FOP
// COPY IN HTSCORE.C
/**
 * Finish the generated index file, at most once.
 *
 * When the index has not been finished yet (makeindex_done is zero) and the
 * index file is open: the footer template is written (with a string built
 * from the escaped first link when exactly one link was indexed, an empty
 * string otherwise), the file is flushed and closed, makeindex_fp is reset
 * to NULL and usercommand() is called for the resulting "index.html".
 * makeindex_done is set to 1 in every case.
 *
 * The footer string is formatted with a bounded snprintf() into a buffer
 * large enough for a fully escaped link, so a long first link cannot
 * overrun it.
 *
 * NOTE: per the original remark above, a copy of this macro lives in
 * htscore.c; keep the two in sync.
 **/
#define HT_INDEX_END do { \
  if (!makeindex_done) { \
    if (makeindex_fp) { \
      /* room for the escaped link (see link_escaped) plus surrounding text */ \
      char BIGSTK tempo[HTS_URLMAXSIZE*2 + 1024]; \
      if (makeindex_links == 1) { \
        char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \
        escape_uri_utf(makeindex_firstlink, link_escaped, sizeof(link_escaped)); \
        /* bounded write; force termination for non-conforming snprintf */ \
        snprintf(tempo, sizeof(tempo), ""CRLF, link_escaped); \
        tempo[sizeof(tempo) - 1] = '\0'; \
      } else \
        tempo[0]='\0'; \
      hts_template_format(makeindex_fp,template_footer, \
        "", \
        tempo, /* EOF */ NULL \
      ); \
      fflush(makeindex_fp); \
      fclose(makeindex_fp); /* do not forget this, or expect a sleepless night */ \
      makeindex_fp=NULL; \
      usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt), StringBuff(opt->path_html_utf8),"index.html"),"primary","primary"); \
    } \
  } \
  makeindex_done=1; /* done */ \
} while(0)
/*
 * Declare, in the calling function, the local variables the parser works
 * with, each initialised from the extended module structure 'stre' (after
 * ENGINE_DEFINE_CONTEXT_BASE() has declared its own set).
 * Members read through '*' are pointers inside 'stre': the locals are
 * working copies, and ENGINE_SAVE_CONTEXT() stores them back.
 * The expansion ends without a semicolon; the caller supplies it.
 */
#define ENGINE_DEFINE_CONTEXT() \
ENGINE_DEFINE_CONTEXT_BASE(); \
/* constant pointers taken from stre */ \
htsblk* const r HTS_UNUSED = stre->r_; \
hash_struct* const hash HTS_UNUSED = stre->hash_; \
char* const codebase HTS_UNUSED = stre->codebase; \
char* const base HTS_UNUSED = stre->base; \
/* index templates */ \
const char * const template_header HTS_UNUSED = stre->template_header_; \
const char * const template_body HTS_UNUSED = stre->template_body_; \
const char * const template_footer HTS_UNUSED = stre->template_footer_; \
/* buffer holding the first link written to the index */ \
HTS_UNUSED char* const makeindex_firstlink = stre->makeindex_firstlink_; \
/* working copies of values stre points to: */ \
/* error state */ \
int error = * stre->error_; \
int store_errpage = * stre->store_errpage_; \
/* index generation state */ \
int makeindex_done = *stre->makeindex_done_; \
FILE* makeindex_fp = *stre->makeindex_fp_; \
int makeindex_links = *stre->makeindex_links_; \
/* statistics; makestat_* are plain copies, not stored back by ENGINE_SAVE_CONTEXT() */ \
LLint stat_fragment = *stre->stat_fragment_; \
HTS_UNUSED TStamp makestat_time = stre->makestat_time; \
HTS_UNUSED FILE* makestat_fp = stre->makestat_fp
/*
 * Refresh the already-declared local working copies from 'stre' (after
 * ENGINE_SET_CONTEXT_BASE() has refreshed its own set). Only the
 * non-const locals declared by ENGINE_DEFINE_CONTEXT() are assigned.
 * The expansion ends without a semicolon; the caller supplies it.
 */
#define ENGINE_SET_CONTEXT() \
ENGINE_SET_CONTEXT_BASE(); \
/* error state */ \
error = * stre->error_; \
store_errpage = * stre->store_errpage_; \
/* index generation state */ \
makeindex_done = *stre->makeindex_done_; \
makeindex_fp = *stre->makeindex_fp_; \
makeindex_links = *stre->makeindex_links_; \
/* statistics */ \
stat_fragment = *stre->stat_fragment_; \
makestat_time = stre->makestat_time; \
makestat_fp = stre->makestat_fp
/* Alias of ENGINE_DEFINE_CONTEXT(): declares and initialises the locals. */
#define ENGINE_LOAD_CONTEXT() \
ENGINE_DEFINE_CONTEXT()
/*
 * Store the local working copies back through the pointers held in 'stre'
 * (after ENGINE_SAVE_CONTEXT_BASE() has stored its own set).
 * makestat_time and makestat_fp are not written back here.
 * The expansion ends without a semicolon; the caller supplies it.
 */
#define ENGINE_SAVE_CONTEXT() \
ENGINE_SAVE_CONTEXT_BASE(); \
/* error state */ \
* stre->error_ = error; \
* stre->store_errpage_ = store_errpage; \
/* index generation state */ \
*stre->makeindex_done_ = makeindex_done; \
*stre->makeindex_fp_ = makeindex_fp; \
*stre->makeindex_links_ = makeindex_links; \
/* statistics */ \
*stre->stat_fragment_ = stat_fragment
/* Shorthand: the object opt->filters.filters points to. */
#define _FILTERS (*opt->filters.filters)
/* Shorthand for opt->filters.filptr. */
#define _FILTERS_PTR (opt->filters.filptr)
/* opt->robotsptr, cast to a robots_wizard pointer. */
#define _ROBOTS ((robots_wizard*)opt->robotsptr)
/*
 * Feed the character under 'html' to the script state table.
 * Only active while 'inscript' is set: the next state is looked up in the
 * row of the current state, using the character as column; a negative entry
 * means "no specific transition" and the INSCRIPT_DEFAULT column is used
 * instead. The resulting state must be non-negative and a valid row index.
 */
#define AUTOMATE_LOOKUP_CURRENT_ADR() do { \
  int new_state_pos; \
  if (!inscript) \
    break; /* leaves this do/while(0) only */ \
  new_state_pos = inscript_state[inscript_state_pos][(unsigned char) *html]; \
  if (new_state_pos < 0) \
    new_state_pos = inscript_state[inscript_state_pos][INSCRIPT_DEFAULT]; \
  assertf(new_state_pos >= 0); \
  assertf(new_state_pos*sizeof(inscript_state[0]) < sizeof(inscript_state)); \
  inscript_state_pos = new_state_pos; \
} while(0)
/*
 * Advance the read pointer 'html' by 'steps' characters, one at a time,
 * running the script state lookup on each character reached.
 * 'steps' is evaluated once and converted to int; zero or a negative value
 * leaves 'html' untouched.
 */
#define INCREMENT_CURRENT_ADR(steps) do { \
  int remaining_; \
  for (remaining_ = (int) ( steps ); remaining_ > 0; remaining_--) { \
    html++; \
    AUTOMATE_LOOKUP_CURRENT_ADR(); \
  } \
} while(0)
/* Main parser */
int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
char catbuff[CATBUFF_SIZE];
/* Load engine variables */
ENGINE_LOAD_CONTEXT();
{
char *cAddr = r->adr;
int cSize = (int) r->size;
hts_log_print(opt, LOG_DEBUG, "engine: preprocess-html: %s%s", urladr(),
urlfil());
if (RUN_CALLBACK4(opt, preprocess, &cAddr, &cSize, urladr(), urlfil()) == 1) {
r->adr = cAddr;
r->size = cSize;
}
}
if (RUN_CALLBACK4(opt, check_html, r->adr, (int) r->size, urladr(), urlfil())) {
FILE *fp = NULL; // fichier écrit localement
const char *html = r->adr; // pointeur (on parcours)
const char *lastsaved; // adresse du dernier octet sauvé + 1
hts_log_print(opt, LOG_DEBUG, "scanning file %s%s (%s)..", urladr(), urlfil(),
savename());
/* Hack to avoid NULL char problems with C syntax */
/* Yes, some bogus HTML pages can embed null chars
and therefore can not be properly handled if this hack is not done
*/
if (r->adr != NULL) {
size_t i;
for(i = 0 ; i < (size_t) r->size ; i++) {
if (r->adr[i] == '\0') {
r->adr[i] = ' ';
}
}
}
// Indexing!
#if HTS_MAKE_KEYWORD_INDEX
if (opt->kindex) {
if (index_keyword
(r->adr, r->size, r->contenttype, savename(),
StringBuff(opt->path_html_utf8))) {
hts_log_print(opt, LOG_DEBUG, "indexing file..done");
} else {
hts_log_print(opt, LOG_DEBUG, "indexing file..error!");
}
}
#endif
// Now, parsing
if ((opt->getmode & 1) && (ptr > 0)) { // récupérer les html sur disque
// créer le fichier html local
HT_ADD_FOP; // écrire peu à peu le fichier
}
if (!error) {
// output HTML
TypedArray(char) output_buffer = EMPTY_TYPED_ARRAY;
time_t user_interact_timestamp = 0;
int detect_title = 0; // détection du title
int back_add_stats = opt->state.back_add_stats;
const char *in_media = NULL; // in other media type (real media and so..)
int intag = 0; // on est dans un tag
int incomment = 0; // dans un ", /* EOF */ NULL);
} else
makeindex_done = -1; // fait, erreur
}
if (makeindex_fp != NULL) {
char BIGSTK tempo[HTS_URLMAXSIZE * 2];
char BIGSTK s[HTS_URLMAXSIZE * 2];
char *a = NULL;
char *b = NULL;
s[0] = '\0';
if (p > 0) {
a = strchr(html, '>');
if (a != NULL) {
a++;
while(is_space(*a))
a++; // sauter espaces & co
b = strchr(a, '<'); // prochain tag
}
}
if (lienrelatif
(tempo, heap(ptr)->sav,
concat(OPT_GET_BUFF(opt), OPT_GET_BUFF_SIZE(opt),
StringBuff(opt->path_html_utf8),
"index.html")) == 0) {
detect_title = 1; // ok détecté pour cette page!
makeindex_links++; // un de plus
strcpybuff(makeindex_firstlink, tempo);
//
/* Hack */
if (opt->mimehtml) {
strcpybuff(makeindex_firstlink,
"cid:primary/primary");
}
if ((b == a) || (a == NULL) || (b == NULL)) { // pas de titre
strcpybuff(s, tempo);
} else if ((b - a) < 256) {
b--;
while(is_space(*b))
b--;
strncpy(s, a, b - a + 1);
*(s + (b - a) + 1) = '\0';
}
// Decode title with encoding
if (str->page_charset_ != NULL
&& *str->page_charset_ != '\0') {
char *const sUtf =
hts_convertStringToUTF8(s, strlen(s), str->page_charset_);
if (sUtf != NULL) {
strcpy(s, sUtf);
free(sUtf);
}
}
// Body
inplace_escape_uri_utf(tempo, sizeof(tempo));
hts_template_format(makeindex_fp, template_body, tempo, s, /* EOF */ NULL);
}
}
}
}
}
} else if (heap(ptr)->depth < opt->depth) { // on a sauté level1+1 et level1
HT_INDEX_END;
}
} // if (opt->makeindex)
}
// FIN Construction index.html (sommaire)
/*
end -- index.html built here
*/
/* Parse */
if ((*html == '<') /* No starting tag */
&&(!inscript) /* Not in (java)script */
&&(!incomment) /* Not in comment (charset);
HT_ADD("\" />");
HT_ADD(eol);
}
}
}
}
// éliminer les is used somewhere else.. darn those browsers are dirty
*/
if (!strstr(html, "-->")) {
intag = 0;
incomment = 0;
intag_start_valid = 0;
intag_name = NULL;
}
}
#endif
}
//}
}
//else if (*adr==34) {
// inquote=(inquote?0:1);
//}
else if (intag || inscript || in_media) { // nous sommes dans un tag/commentaire, tester si on recoit un tag
int p_type = 0;
int p_nocatch = 0;
int p_searchMETAURL = 0; // chercher ..URL=
int add_class = 0; // ajouter .class
int add_class_dots_to_patch = 0; // number of '.' in code="x.y.z"
const char *p_flush = NULL;
// ------------------------------------------------------------
// parsing évolé
// ------------------------------------------------------------
if (((isalpha((unsigned char) *html)) || (*html == '/') || (inscript) || (in_media) || (inscriptgen))) { // sinon pas la peine de tester..
/* caractère de terminaison pour "miniparsing" javascript=.. ?
(ex: ) */
if (inscript_tag) {
if (inscript_tag_lastc) {
if (*html == inscript_tag_lastc) {
/* sortir */
inscript_tag = inscript = 0;
incomment = 0;
if (opt->parsedebug) {
HT_ADD("<@@ /inscript @@>");
}
}
}
}
/* automate */
AUTOMATE_LOOKUP_CURRENT_ADR();
// Note:
// Certaines pages ne respectent pas le html
// notamment les guillements ne sont pas fixés
// Nous sommes dans un tag, donc on peut faire un test plus
// large pour pouvoi prendre en compte ces particularités
// à vérifier: ACTION, CODEBASE, VRML
if (in_media) {
if (strcmp(in_media, "LNK") == 0) { // real media
p = 0;
valid_p = 1;
} else if (strcmp(in_media, "AAM") == 0) { // AAM
if (is_space((unsigned char) html[0])
&& !is_space((unsigned char) html[1])) {
const char *a = html + 1;
int n = 0;
int ok = 0;
int dot = 0;
while(n < HTS_URLMAXSIZE / 2 && a[n] != '\0'
&& (!is_space((unsigned char) a[n]) || !(ok = 1))
) {
if (a[n] == '.') {
dot = n;
}
n++;
}
if (ok && dot > 0) {
char BIGSTK tmp[HTS_URLMAXSIZE / 2 + 2];
tmp[0] = '\0';
strncat(tmp, a + dot + 1, n - dot - 1);
if (is_knowntype(opt, tmp) || ishtml_ext(tmp) != -1) {
html++;
p = 0;
valid_p = 1;
unquoted_script = 1;
}
}
}
}
} else if (ptr > 0) { /* pas première page 0 (primary) */
p = 0; // saut pour le nom de fichier: adresse nom fichier=adr+p
// ------------------------------
// détection d'écriture JavaScript.
// osons les obj.write et les obj.href=.. ! osons!
// note: inscript==1 donc on sautera après les \"
if (inscript) {
if (inscriptgen) { // on est déja dans un objet générant..
if (*html == scriptgen_q) { // fermeture des " ou '
if (*(html - 1) != '\\') { // non
inscriptgen = 0; // ok parsing terminé
}
}
} else {
const char *a = NULL;
char check_this_fking_line = 0; // parsing code javascript..
char must_be_terminated = 0; // caractère obligatoire de terminaison!
int token_size;
if (!(token_size = strfield(html, ".writeln"))) // détection ...objet.write[ln]("code html")...
token_size = strfield(html, ".write");
if (token_size) {
a = html + token_size;
while(is_realspace(*a))
a++; // sauter espaces
if (*a == '(') { // début parenthèse
check_this_fking_line = 2; // à parser!
must_be_terminated = ')';
a++; // sauter (
}
}
// euhh ??? ???
/* else if (strfield(adr,".href")) { // détection ...objet.href="...
a=adr+5;
while(is_realspace(*a)) a++; // sauter espaces
if (*a=='=') { // ohh un égal
check_this_fking_line=1; // à noter!
must_be_terminated=';'; // et si t'as oublié le ; tu sais pas coder
a++; // sauter =
}
} */
// on a un truc du genre instruction"code généré" dont on parse le code
if (check_this_fking_line) {
while(is_realspace(*a))
a++;
if ((*a == '\'') || (*a == '"')) { // départ de '' ou ""
const char *b;
scriptgen_q = *a; // quote
b = a + 1; // départ de la chaîne
// vérifier forme ("code") et pas ("code"+var), ingérable
do {
if (*a == scriptgen_q && *(a - 1) != '\\') // quote non slash
break; // sortie
else if (*a == 10 && *(a - 1) != '\\' /* LF and no continue (\) character */
&& (*(a - 1) != '\r' || *(a - 2) != '\\')) /* and not CRLF and no .. */
break;
else
a++; // caractère suivant
} while((a - b) < HTS_URLMAXSIZE / 2);
if (*a == scriptgen_q) { // fin du quote
a++;
while(is_realspace(*a))
a++;
if (*a == must_be_terminated) { // parenthèse fermante: ("..")
// bon, on doit parser une ligne javascript
// 1) si check.. ==1 alors c'est un nom de fichier direct, donc
// on fixe p sur le saut nécessaire pour atteindre le nom du fichier
// et le moteur se débrouillera ensuite tout seul comme un grand
// 2) si check==2 c'est un peu plus tordu car là on génére du
// code html au sein de code javascript au sein de code html
// dans ce cas on doit fixer un flag à un puis ensuite dans la boucle
// on devra parser les instructions standard comme debug > 1) && (opt->log != NULL)) {
char str[512];
str[0] = '\0';
strncatbuff(str, b, minimum((int) (a - b + 1), 32));
hts_log_print(opt, LOG_DEBUG,
"active code (%s) detected in javascript: %s",
(check_this_fking_line ==
2) ? "parse" : "pickup", str);
}
}
}
}
}
}
}
// fin detection code générant javascript vers html
// ------------------------------
// analyse proprement dite, A HREF=.. etc..
if (!p) {
// si dans un tag, et pas dans un script - sauf si on analyse un obj.write("..
if ((intag && (!inscript)) || inscriptgen) {
if ((*(html - 1) == '<') || (is_space(*(html - 1)))) { // etc) */
if (p == 0) {
int i = 0;
while((p == 0) && (strnotempty(hts_detect[i]))) {
p = rech_tageq(html, hts_detect[i]);
if (p) {
/* This is a temporary hack to avoid archive=foo.jar,bar.jar .. */
if (strcmp(hts_detect[i], "archive") == 0) {
archivetag_p = 1;
}
}
i++;
}
}
/* Tags supplémentaires en début à vérifier (