summary | refs | log | tree | commit | diff
path: root/src/htsname.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/htsname.c')
-rw-r--r-- src/htsname.c 213
1 files changed, 159 insertions, 54 deletions
diff --git a/src/htsname.c b/src/htsname.c
index 56fa6a6..8af2062 100644
--- a/src/htsname.c
+++ b/src/htsname.c
@@ -35,14 +35,15 @@ Please visit our Website: http://www.httrack.com
/* Author: Xavier Roche */
/* ------------------------------------------------------------ */
+/* Internal engine bytecode */
+#define HTS_INTERNAL_BYTECODE
+
#include "htsname.h"
/* specific definitions */
#include "htsbase.h"
#include "htstools.h"
#include "htsmd5.h"
-#include <stdio.h>
-#include <stdlib.h>
#include <ctype.h>
/* END specific definitions */
@@ -51,7 +52,7 @@ Please visit our Website: http://www.httrack.com
#define ADD_STANDARD_PATH \
{ /* ajout nom */\
- char buff[HTS_URLMAXSIZE*2];\
+ char BIGSTK buff[HTS_URLMAXSIZE*2];\
buff[0]='\0';\
strncatbuff(buff,start_pos,(int) (nom_pos - start_pos));\
url_savename_addstr(save,buff);\
@@ -59,7 +60,7 @@ Please visit our Website: http://www.httrack.com
#define ADD_STANDARD_NAME(shortname) \
{ /* ajout nom */\
- char buff[HTS_URLMAXSIZE*2];\
+ char BIGSTK buff[HTS_URLMAXSIZE*2];\
standard_name(buff,dot_pos,nom_pos,fil_complete,(shortname));\
url_savename_addstr(save,buff);\
}
@@ -78,13 +79,38 @@ static const char *hts_tbdev[] =
};
+#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() do { \
+ int prev = _hts_in_html_parsing; \
+ while(back_pluggable_sockets_strict(back, back_max, opt) <= 0) { \
+ _hts_in_html_parsing = 6; \
+ /* Macro purpose: loop until back_pluggable_sockets_strict() reports > 0; every pass first calls back_wait() */ \
+ back_wait(back,back_max,opt,cache,0); \
+ /* Transfer rate (updated through engine_stats) */ \
+ engine_stats(); \
+ /* Refresh the HTS_STAT fields below; relies on back, back_max, liens, lien_tot and ptr being in scope where the macro is expanded */ \
+ HTS_STAT.stat_nsocket=back_nsoc(back,back_max); \
+ HTS_STAT.stat_errors=fspc(NULL,"error"); \
+ HTS_STAT.stat_warnings=fspc(NULL,"warning"); \
+ HTS_STAT.stat_infos=fspc(NULL,"info"); \
+ HTS_STAT.nbk=backlinks_done(liens,lien_tot,ptr); \
+ HTS_STAT.nb=back_transfered(HTS_STAT.stat_bytes,back,back_max); \
+ /* If hts_htmlcheck_loop() returns 0, the function expanding this macro returns -1. NOTE(review): that early return skips the restore of _hts_in_html_parsing below (left at 6) - confirm this is intended */ \
+ if (!hts_htmlcheck_loop(back,back_max,-1,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \
+ return -1; \
+ } \
+ } \
+ _hts_in_html_parsing = prev; \
+} while(0)
+
// forme le nom du fichier à sauver (save) à partir de fil et adr
// système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html)
int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_adr,char* former_fil,char* referer_adr,char* referer_fil,httrackp* opt,lien_url** liens,int lien_tot,lien_back* back,int back_max,cache_back* cache,hash_struct* hash,int ptr,int numero_passe) {
- char newfil[HTS_URLMAXSIZE*2]; /* ="" */
- /*char normadr_[HTS_URLMAXSIZE*2];*/
- char normfil_[HTS_URLMAXSIZE*2];
+ char BIGSTK newfil[HTS_URLMAXSIZE*2]; /* ="" */
+ /*char BIGSTK normadr_[HTS_URLMAXSIZE*2];*/
+ char BIGSTK normadr_[HTS_URLMAXSIZE*2], normfil_[HTS_URLMAXSIZE*2];
+ int protocol = 0;
+ static const char* protocol_str[] = {"http", "https", "ftp", "file", "unknown"};
char* normadr;
char* normfil;
char* fil;
@@ -100,11 +126,11 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
/* 8-3 ? */
switch(opt->savename_83) {
- case 1:
+ case 1: // 8-3
max_char=8;
break;
- case 2:
- max_char=30;
+ case 2: // Level 2 File names may be up to 31 characters.
+ max_char=31;
break;
default:
max_char=8;
@@ -130,13 +156,33 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
// www-42.foo.com -> foo.com
// foo.com/bar//foobar -> foo.com/bar/foobar
if (opt->urlhack) {
- // copy of adr (withiotu protocol), used for lookups (see urlhack)
- normadr=jump_normalized(adr);
+ // copy of adr (without protocol), used for lookups (see urlhack)
+ normadr=adr_normalized(adr, normadr_);
normfil=fil_normalized(fil,normfil_);
+ } else {
+ if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder
+ char* pos = strchr(adr_complete, ':');
+ if (pos != NULL) {
+ normadr_[0] = '\0';
+ strncatbuff(normadr_, adr_complete, (int)(pos - adr_complete));
+ strcatbuff(normadr_, "://");
+ strcatbuff(normadr_, normadr);
+ normadr=normadr_;
+ }
+ }
}
// à afficher sans ftp://
print_adr=jump_protocol(adr);
+ if (strfield(adr_complete, "https:")) {
+ protocol = 1;
+ } else if (strfield(adr_complete, "ftp:")) {
+ protocol = 2;
+ } else if (strfield(adr_complete, "file:")) {
+ protocol = 3;
+ } else {
+ protocol = 0;
+ }
// court-circuit pour lien primaire
if (strnotempty(adr)==0) {
@@ -199,7 +245,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
// chercher sans / ou avec / dans former
{
- char fil_complete_patche[HTS_URLMAXSIZE*2];
+ char BIGSTK fil_complete_patche[HTS_URLMAXSIZE*2];
strcpybuff(fil_complete_patche,normfil);
// Version avec ou sans /
if (fil_complete_patche[strlen(fil_complete_patche)-1]=='/')
@@ -254,30 +300,13 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
}
// décoder %
strcpybuff(fil,unescape_http(fil));
- /*
- {
- char tempo[HTS_URLMAXSIZE*2];
- int i,j=0;
- for (i=0;i<(int) strlen(fil);i++) {
- if (fil[i]=='%') {
- i++;
- tempo[j++]=(char) ehex(fil+i);
- i++; // sauter 2 caractères finalement
- } else
- tempo[j++]=fil[i];
- }
- tempo[j++]='\0';
- strcpybuff(fil,tempo);
- }
- */
-
-
+
/* replace shtml to html.. */
switch (ishtml(fil)) { /* .html,.shtml,.. */
case 1:
if (
- (strcmp(get_ext(fil),"html") != 0)
- && (strcmp(get_ext(fil),"htm") != 0)
+ (strfield2(get_ext(fil),"html") == 0)
+ && (strfield2(get_ext(fil),"htm") == 0)
) {
strcpybuff(ext,"html");
ext_chg=1;
@@ -286,7 +315,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
case 0:
if (!strnotempty(ext)) {
if (is_userknowntype(get_ext(fil))) { // mime known by user
- char mime[1024];
+ char BIGSTK mime[1024];
mime[0]=ext[0]='\0';
get_userhttptype(0,mime,get_ext(fil));
if (strnotempty(mime)) {
@@ -330,13 +359,35 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
}
}
//
+ } else if (is_userknowntype(fil)) { /* PATCH BY BRIAN SCHRÖDER.
+ Lookup mimetype not only by extension,
+ but also by filename */
+ /* Note: "foo.cgi => text/html" means that foo.cgi shall have the text/html MIME file type,
+ that is, ".html" */
+ char BIGSTK mime[1024];
+ mime[0]=ext[0]='\0';
+ get_userhttptype(0, mime, fil);
+ if (strnotempty(mime)) {
+ give_mimext(ext, mime);
+ if (strnotempty(ext)) {
+ ext_chg=1;
+ }
+ }
} else { // test imposible dans le cache, faire une requête
//
#if HTS_ANALYSTE
int hihp=_hts_in_html_parsing;
#endif
int has_been_moved=0;
- char curr_adr[HTS_URLMAXSIZE*2],curr_fil[HTS_URLMAXSIZE*2];
+ char BIGSTK curr_adr[HTS_URLMAXSIZE*2],curr_fil[HTS_URLMAXSIZE*2];
+
+ /* Ensure we don't use too many sockets by using a "testing" one
+ If we have only 1 simultaneous connection authorized, wait for pending download
+ Wait for an available slot
+ */
+ URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
+
+ /* Rock'in */
curr_adr[0]=curr_fil[0]='\0';
#if HTS_ANALYSTE
_hts_in_html_parsing=2; // test
@@ -383,7 +434,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
if (!hts_htmlcheck_loop(back,back_max,b,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) {
return -1;
} else if (_hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing)
- back_delete(opt,back,b); // cancel test
+ back_delete(opt,cache,back,b); // cancel test
stop_looping = 1;
}
}
@@ -399,7 +450,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
) { // agh moved.. un tit tour de plus
if ((petits_tours<5) && (former_adr) && (former_fil)) { // on va pas tourner en rond non plus!
if ((int) strnotempty(back[b].r.location)) { // location existe!
- char mov_url[HTS_URLMAXSIZE*2],mov_adr[HTS_URLMAXSIZE*2],mov_fil[HTS_URLMAXSIZE*2];
+ char BIGSTK mov_url[HTS_URLMAXSIZE*2],mov_adr[HTS_URLMAXSIZE*2],mov_fil[HTS_URLMAXSIZE*2];
mov_url[0]=mov_adr[0]=mov_fil[0]='\0';
//
strcpybuff(mov_url,back[b].r.location); // copier URL
@@ -424,11 +475,12 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
robots_wizard* robots = (robots_wizard*) opt->robotsptr;
if (hts_acceptlink(opt,ptr,lien_tot,liens,
mov_adr,mov_fil,
+ NULL, NULL,
&set_prio_to,
NULL) == 1)
{ /* forbidden */
has_been_moved = 1;
- back_maydelete(opt,back,b); // ok
+ back_maydelete(opt,cache,back,b); // ok
strcpybuff(curr_adr,mov_adr);
strcpybuff(curr_fil,mov_fil);
mov_url[0]='\0';
@@ -439,7 +491,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
// ftp: stop!
if (strfield(mov_url,"ftp://")) { // ftp, ok on arrête
has_been_moved = 1;
- back_maydelete(opt,back,b); // ok
+ back_maydelete(opt,cache,back,b); // ok
strcpybuff(curr_adr,mov_adr);
strcpybuff(curr_fil,mov_fil);
stop_looping = 1;
@@ -455,6 +507,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
}
}
// Ajouter
+ URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET();
if (back_add(back,back_max,opt,cache,mov_adr,mov_fil,methode,referer_adr,referer_fil,1,NULL)!=-1) { // OK
if ( (opt->debug>1) && (opt->errlog!=NULL) ) {
fspc(opt->errlog,"warning"); fprintf(opt->errlog,"(during prefetch) %s (%d) to link %s at %s%s"LF,back[b].r.msg,back[b].r.statuscode,back[b].r.location,curr_adr,curr_fil);
@@ -462,7 +515,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
}
// libérer emplacement backing actuel et attendre le prochain
- back_maydelete(opt,back,b);
+ back_maydelete(opt,cache,back,b);
strcpybuff(curr_adr,mov_adr);
strcpybuff(curr_fil,mov_fil);
b=back_index(back,back_max,curr_adr,curr_fil,methode);
@@ -507,7 +560,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
fspc(opt->errlog,0); fprintf(opt->errlog,"Error: (during prefetch) %s (%d) to link %s at %s%s"LF,back[b].r.msg,back[b].r.statuscode,back[b].r.location,curr_adr,curr_fil);
test_flush;
}
- back_delete(opt,back,b);
+ back_delete(opt,cache,back,b);
return -1; // ERREUR (404 par exemple)
*/
}
@@ -531,7 +584,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
// FIN Si non déplacé, forcer type?
// libérer emplacement backing
- back_maydelete(opt,back,b);
+ back_maydelete(opt,cache,back,b);
// --- --- ---
// oops, a été déplacé.. on recalcule en récursif (osons!)
@@ -787,7 +840,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
if (!short_ver) { // Noms longs
strncatbuff(b,fil,(int) (nom_pos - fil) - 1);
} else {
- char pth[HTS_URLMAXSIZE*2],n83[HTS_URLMAXSIZE*2];
+ char BIGSTK pth[HTS_URLMAXSIZE*2],n83[HTS_URLMAXSIZE*2];
pth[0]=n83[0]='\0';
//
strncatbuff(pth,fil,(int) (nom_pos - fil) - 1);
@@ -816,7 +869,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
*b='\0';
{
char digest[32+2];
- char buff[HTS_URLMAXSIZE*2];
+ char BIGSTK buff[HTS_URLMAXSIZE*2];
digest[0]=buff[0]='\0';
strcpybuff(buff,adr);
strcatbuff(buff,fil_complete);
@@ -831,6 +884,11 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
strncatbuff(b,url_md5(fil_complete),(tok == 'Q')?32:4);
b+=strlen(b); // pointer à la fin
break;
+ case 'r': case 'R': // protocol
+ *b='\0';
+ strcatbuff(b, protocol_str[protocol]);
+ b+=strlen(b); // pointer à la fin
+ break;
}
} else
*b++=*a++;
@@ -1044,7 +1102,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
{
char* a=jump_identification(save);
if (a!=save) {
- char tempo[HTS_URLMAXSIZE*2];
+ char BIGSTK tempo[HTS_URLMAXSIZE*2];
char *b;
tempo[0]='\0';
strcpybuff(tempo,"[");
@@ -1061,7 +1119,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
// éviter les / au début (cause: N100)
if (save[0]=='/') {
- char tempo[HTS_URLMAXSIZE*2];
+ char BIGSTK tempo[HTS_URLMAXSIZE*2];
strcpybuff(tempo,save+1);
strcpybuff(save,tempo);
}
@@ -1110,7 +1168,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
case '/':
case '.':
{
- char tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0';
+ char BIGSTK tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0';
strncatbuff(tempo,save,(int) (a - save) + strlen(hts_tbdev[i]));
strcatbuff(tempo,"_");
strcatbuff(tempo,a+strlen(hts_tbdev[i]));
@@ -1123,15 +1181,57 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
i++;
}
}
+ /* Strip ending . or ' ' forbidden on windoz */
+ {
+ int len;
+ char* a=save;
+ while((a=strstr(a,"./"))) {
+ *a = '_';
+ }
+ a=save;
+ while((a=strstr(a," /"))) {
+ *a = '_';
+ }
+ len = (int) strlen(save);
+ if (len > 0 && ( save[len - 1] == '.' || save[len - 1] == ' ') ) {
+ save[len - 1] = '_';
+ }
+ }
#endif
// conversion 8-3 .. y compris pour les répertoires
if (opt->savename_83) {
- char n83[HTS_URLMAXSIZE*2];
+ char BIGSTK n83[HTS_URLMAXSIZE*2];
long_to_83(opt->savename_83,n83,save);
strcpybuff(save,n83);
}
+ // enforce stricter ISO9660 compliance (bug reported by Steffo Carlsson)
+ // Level 1 File names are restricted to 8 characters with a 3 character extension,
+ // upper case letters, numbers and underscore; maximum depth of directories is 8.
+ // This will be our "DOS mode"
+ // L2: 31 characters
+ // A-Z,0-9,_
+ if (opt->savename_83 > 0) {
+ char *a, *last;
+ for(last = save + strlen(save) - 1 ; last != save && *last != '/' && *last != '\\' && *last != '.' ; last--);
+ if (*last != '.') {
+ last = NULL;
+ }
+ for(a = save ; *a != '\0' ; a++) {
+ if (*a >= 'a' && *a <= 'z') {
+ *a -= 'a' - 'A';
+ }
+ else if (*a == '.') {
+ if (a != last) {
+ *a = '_';
+ }
+ }
+ else if ( ! ( (*a >= 'A' && *a <= 'Z') || (*a >= '0' && *a <= '9') || *a == '_' || *a == '/' || *a == '\\') ) {
+ *a = '_';
+ }
+ }
+ }
/* ensure that there is no ../ (potential vulnerability) */
fil_simplifie(save);
@@ -1148,7 +1248,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a
// chemin primaire éventuel A METTRE AVANT
if (strnotempty(opt->path_html)) {
- char tempo[HTS_URLMAXSIZE*2];
+ char BIGSTK tempo[HTS_URLMAXSIZE*2];
strcpybuff(tempo,opt->path_html);
strcatbuff(tempo,save);
strcpybuff(save,tempo);
@@ -1189,17 +1289,22 @@ printf("%cParse: %d",13,i);
#if HTS_CASSE
if ((strcmp(liens[i]->adr,adr)==0) && (strcmp(liens[i]->fil,fil_complete)==0))
#else
- if ((strfield2(liens[i]->adr,adr)) && (strfield2(liens[i]->fil,fil_complete)))
+ if ((strfield2(liens[i]->adr, normadr)) && (strfield2(liens[i]->fil, normfil)))
+ //if ((strfield2(liens[i]->adr,adr)) && (strfield2(liens[i]->fil,fil_complete)))
#endif
{ // ok c'est le même lien, adresse déja définie
- //printf("Ok, %s\n",save);
- //i=lien_tot; // sortir
+ /* Take the existing name not to screw up with cAsE sEnSiTiViTy of Linux/Unix */
+ if (strcmp(liens[i]->sav, save) != 0) {
+ strcpybuff(save, liens[i]->sav);
+ }
i=0;
#if DEBUG_SAVENAME
printf("\nOK ALREADY DEFINED\n",13,i);
#endif
+#if HTS_CASSE
+#endif
} else { // utilisé par un AUTRE, changer de nom
- char tempo[HTS_URLMAXSIZE*2];
+ char BIGSTK tempo[HTS_URLMAXSIZE*2];
char* a=save+strlen(save)-1;
char* b;
int n=2;
@@ -1310,7 +1415,7 @@ char* url_md5(char* fil_complete) {
a=strchr(fil_complete,'?');
if (a) {
if (strlen(a)) {
- char buff[HTS_URLMAXSIZE*2];
+ char BIGSTK buff[HTS_URLMAXSIZE*2];
a++;
digest[0]=buff[0]='\0';
strcatbuff(buff,a); /* query string MD5 */