diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2012-03-19 12:59:03 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2012-03-19 12:59:03 +0000 |
commit | 660b569b0980fc8f71b03ed666dd02eec8388b4c (patch) | |
tree | 8ad02b5f0bebdd4cd1d2ba01005d6f3f71a0a7fb /src/proxy/store.c | |
parent | 64cc4a88da8887ef1f7f4d90be0158d2cc76222d (diff) |
httrack 3.41.2
Diffstat (limited to 'src/proxy/store.c')
-rw-r--r-- | src/proxy/store.c | 822 |
1 files changed, 782 insertions, 40 deletions
diff --git a/src/proxy/store.c b/src/proxy/store.c index 1d17574..b8233a8 100644 --- a/src/proxy/store.c +++ b/src/proxy/store.c @@ -20,6 +20,8 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. Please visit our Website: http://www.httrack.com */ +/* Parts (inside ARC format routines) by Lars Clausen (lc@statsbiblioteket.dk) */ + /* ------------------------------------------------------------ */ /* File: Cache manager for ProxyTrack */ /* Author: Xavier Roche */ @@ -28,6 +30,7 @@ Please visit our Website: http://www.httrack.com #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <time.h> /* Locking */ #ifdef _WIN32 @@ -40,8 +43,10 @@ Please visit our Website: http://www.httrack.com #define HTS_INTERNAL_BYTECODE #include "htsinthash.h" +#include "htsmd5.h" #undef HTS_INTERNAL_BYTECODE #include "../minizip/mztools.h" +#include "../minizip/zip.h" #include "htscore.h" #include "htsback.h" @@ -58,6 +63,8 @@ static PT_Element PT_ReadCache__New_u(PT_Index index, const char* url, int flags static int PT_LookupCache__Old_u(PT_Index index, const char* url); static PT_Element PT_ReadCache__Old_u(PT_Index index, const char* url, int flags); +static int PT_LookupCache__Arc_u(PT_Index index, const char* url); +static PT_Element PT_ReadCache__Arc_u(PT_Index index, const char* url, int flags); /* Locking */ @@ -100,10 +107,12 @@ void MutexFree(PT_Mutex *pMutex) { typedef struct _PT_Index__New _PT_Index__New; typedef struct _PT_Index__Old _PT_Index__Old; +typedef struct _PT_Index__Arc _PT_Index__Arc; typedef struct _PT_Index_Functions _PT_Index_Functions; typedef struct _PT_Index__New *PT_Index__New; typedef struct _PT_Index__Old *PT_Index__Old; +typedef struct _PT_Index__Arc *PT_Index__Arc; typedef struct _PT_Index_Functions *PT_Index_Functions; enum { @@ -111,29 +120,42 @@ enum { PT_CACHE_MIN = 0, PT_CACHE__NEW = PT_CACHE_MIN, PT_CACHE__OLD, - PT_CACHE_MAX = PT_CACHE__OLD + PT_CACHE__ARC, + PT_CACHE_MAX = PT_CACHE__ARC }; static int PT_LoadCache__New(PT_Index index, const char *filename); static void PT_Index_Delete__New(PT_Index *pindex); static PT_Element PT_ReadCache__New(PT_Index index, const char* url, int flags); static int PT_LookupCache__New(PT_Index index, const char* url); +static int PT_SaveCache__New(PT_Indexes indexes, const char *filename); /**/ static int PT_LoadCache__Old(PT_Index index, const char *filename); static void PT_Index_Delete__Old(PT_Index *pindex); static PT_Element PT_ReadCache__Old(PT_Index index, const char* url, int flags); static int PT_LookupCache__Old(PT_Index index, const char* url); +/**/ +static int PT_LoadCache__Arc(PT_Index index, const char *filename); +static void PT_Index_Delete__Arc(PT_Index *pindex); +static PT_Element PT_ReadCache__Arc(PT_Index index, const char* url, int flags); +static int PT_LookupCache__Arc(PT_Index index, const char* url); +static int PT_SaveCache__Arc(PT_Indexes indexes, const char *filename); struct _PT_Index_Functions { + /* Mandatory services */ int (*PT_LoadCache)(PT_Index index, const char *filename); void (*PT_Index_Delete)(PT_Index *pindex); PT_Element (*PT_ReadCache)(PT_Index index, const char* url, int flags); int (*PT_LookupCache)(PT_Index index, const char* url); + + /* Optional services */ + int (*PT_SaveCache)(PT_Indexes indexes, const char *filename); }; static _PT_Index_Functions _IndexFuncts[] = { - { PT_LoadCache__New, PT_Index_Delete__New, PT_ReadCache__New, PT_LookupCache__New }, - { PT_LoadCache__Old, PT_Index_Delete__Old, PT_ReadCache__Old, PT_LookupCache__Old }, + { PT_LoadCache__New, PT_Index_Delete__New, PT_ReadCache__New, PT_LookupCache__New, PT_SaveCache__New }, + { PT_LoadCache__Old, PT_Index_Delete__Old, PT_ReadCache__Old, PT_LookupCache__Old, NULL }, + { PT_LoadCache__Arc, PT_Index_Delete__Arc, PT_ReadCache__Arc, PT_LookupCache__Arc, PT_SaveCache__Arc }, { NULL, NULL, NULL, NULL } }; @@ -164,11 +186,22 @@ struct _PT_Index__Old { int safeCache; }; +struct _PT_Index__Arc { + PT_INDEX_COMMON_STRUCTURE; + FILE *file; + PT_Mutex fileLock; + int version; + char lastmodified[1024]; + char line[2048]; + char filenameIndexBuff[2048]; +}; + struct _PT_Index { int type; union { _PT_Index__New formatNew; _PT_Index__Old formatOld; + _PT_Index__Arc formatArc; struct { PT_INDEX_COMMON_STRUCTURE; } common; @@ -194,7 +227,7 @@ struct _PT_Cache { int count; }; -PT_Indexes PT_New() { +PT_Indexes PT_New(void) { PT_Indexes index = (PT_Indexes) calloc(sizeof(_PT_Indexes), 1); index->cil = inthash_new(127); index->index_size = 0; @@ -301,6 +334,16 @@ static void PT_Index_Delete__Old(PT_Index *pindex) { } } +static void PT_Index_Delete__Arc(PT_Index *pindex) { + if (pindex != NULL && (*pindex) != NULL) { + PT_Index__Arc index = &(*pindex)->slots.formatArc; + if (index->file != NULL) { + fclose(index->file); + } + MutexFree(&index->fileLock); + } +} + int PT_AddIndex(PT_Indexes indexes, const char *path) { PT_Index index = PT_LoadCache(path); if (index != NULL) { @@ -319,7 +362,7 @@ PT_Element PT_Index_HTML_BuildRootInfo(PT_Indexes indexes) { int i; String html = STRING_EMPTY; StringClear(html); - StringStrcat(html, + StringCat(html, "<html>" PROXYTRACK_COMMENT_HEADER DISABLE_IE_FRIENDLY_HTTP_ERROR_MESSAGES @@ -330,26 +373,26 @@ PT_Element PT_Index_HTML_BuildRootInfo(PT_Indexes indexes) { "<h3>Available sites in this cache:</h3><br />" "<br />" ); - StringStrcat(html, "<ul>\r\n"); + StringCat(html, "<ul>\r\n"); for(i = 0 ; i < indexes->index_size ; i++) { if (indexes->index[i] != NULL && indexes->index[i]->slots.common.startUrl[0] != '\0') { const char * url = indexes->index[i]->slots.common.startUrl; - StringStrcat(html, "<li>\r\n"); - StringStrcat(html, "<a href=\""); - StringStrcat(html, url); - StringStrcat(html, "\">"); - StringStrcat(html, url); - StringStrcat(html, "</a>\r\n"); - StringStrcat(html, "</li>\r\n"); + StringCat(html, "<li>\r\n"); + StringCat(html, "<a href=\""); + StringCat(html, url); + StringCat(html, "\">"); + StringCat(html, url); + StringCat(html, "</a>\r\n"); + StringCat(html, "</li>\r\n"); } } - StringStrcat(html, "</ul>\r\n"); - StringStrcat(html, "</body></html>\r\n"); + StringCat(html, "</ul>\r\n"); + StringCat(html, "</body></html>\r\n"); elt->size = StringLength(html); elt->adr = StringAcquire(&html); - elt->statuscode = 200; + elt->statuscode = HTTP_OK; strcpy(elt->charset, "iso-8859-1"); strcpy(elt->contenttype, "text/html"); strcpy(elt->msg, "OK"); @@ -404,9 +447,9 @@ char ** PT_Enumerate(PT_Indexes indexes, const char *url, int subtree) { char* ptr = NULL; ptr += StringLength(list); if (len > 0) - StringStrcat(list, StringBuff(subitem)); + StringCat(list, StringBuff(subitem)); if (isFolder) - StringStrcat(list, "/"); + StringCat(list, "/"); StringMemcat(list, "\0", 1); /* NULL terminated strings */ StringMemcat(listindexes, &ptr, sizeof(ptr)); listCount++; @@ -434,7 +477,7 @@ char ** PT_Enumerate(PT_Indexes indexes, const char *url, int subtree) { StringMemcat(listindexes, StringBuff(list), StringLength(list)); /* ---- no reallocation beyond this point (fixed addresses) ---- */ /* start of all strings (pointer) */ - startStrings = (startStrings - nullPointer) + StringBuff(listindexes); + startStrings = (startStrings - nullPointer) + StringBuffRW(listindexes); /* transform indexes into references */ for(i = 0 ; i < listCount ; i++) { char *ptr = NULL; @@ -442,7 +485,7 @@ char ** PT_Enumerate(PT_Indexes indexes, const char *url, int subtree) { memcpy(&ptr, &StringBuff(listindexes)[i*sizeof(char*)], sizeof(char*)); ndx = (unsigned int) (ptr - nullPointer); ptr = startStrings + ndx; - memcpy(&StringBuff(listindexes)[i*sizeof(char*)], &ptr, sizeof(char*)); + memcpy(&StringBuffRW(listindexes)[i*sizeof(char*)], &ptr, sizeof(char*)); } blk = StringAcquire(&listindexes); StringFree(list); @@ -460,16 +503,22 @@ void PT_Enumerate_Delete(char ***plist) { } } -PT_Index PT_LoadCache(const char *filename) { - int type = PT_CACHE_UNDEFINED; +static int PT_GetType(const char *filename) { char * dot = strrchr(filename, '.'); if (dot != NULL) { if (strcasecmp(dot, ".zip") == 0) { - type = PT_CACHE__NEW; + return PT_CACHE__NEW; } else if (strcasecmp(dot, ".ndx") == 0 || strcasecmp(dot, ".dat") == 0) { - type = PT_CACHE__OLD; + return PT_CACHE__OLD; + } else if (strcasecmp(dot, ".arc") == 0) { + return PT_CACHE__ARC; } } + return PT_CACHE_UNDEFINED; +} + +PT_Index PT_LoadCache(const char *filename) { + int type = PT_GetType(filename); if (type != PT_CACHE_UNDEFINED) { PT_Index index = calloc(sizeof(_PT_Index), 1); if (index != NULL) { @@ -524,6 +573,44 @@ int PT_LookupCache(PT_Index index, const char* url) { return 0; } +int PT_SaveCache(PT_Indexes indexes, const char *filename) { + int type = PT_GetType(filename); + if (type != PT_CACHE_UNDEFINED) { + if (_IndexFuncts[type].PT_SaveCache != NULL) { + int ret = _IndexFuncts[type].PT_SaveCache(indexes, filename); + if (ret == 0) { + (void) set_filetime_time_t(filename, PT_GetTimeIndex(indexes)); + return 0; + } + } + } + return -1; +} + +int PT_EnumCache(PT_Indexes indexes, int (*callback)(void *, const char *url, PT_Element), void *arg) { + if (indexes != NULL && indexes->cil != NULL) { + struct_inthash_enum en = inthash_enum_new(indexes->cil); + inthash_chain* chain; + while((chain = inthash_enum_next(&en))) { + const long int index_id = (long int)chain->value.intg; + const char *const url = chain->name; + if (index_id >= 0 && index_id <= indexes->index_size) { + PT_Element item = PT_ReadCache(indexes->index[index_id], url, FETCH_HEADERS | FETCH_BODY); + if (item != NULL) { + int ret = callback(arg, url, item); + PT_Element_Delete(&item); + if (ret != 0) + return ret; + } + } else { + CRITICAL("PT_ReadCache:Corrupted central index locator"); + return -1; + } + } + } + return 0; +} + time_t PT_Index_Timestamp(PT_Index index) { return index->slots.common.timestamp; } @@ -569,8 +656,8 @@ int PT_IndexMerge(PT_Indexes indexes, PT_Index *pindex) while((chain = inthash_enum_next(&en)) != NULL) { const char * url = chain->name; if (url != NULL && url[0] != '\0') { - long int previous_index_id = 0; - if (inthash_read(indexes->cil, url, (long int*)&previous_index_id)) { + intptr_t previous_index_id = 0; + if (inthash_read(indexes->cil, url, &previous_index_id)) { if (previous_index_id >= 0 && previous_index_id < indexes->index_size) { if (indexes->index[previous_index_id]->slots.common.timestamp > index->slots.common.timestamp) // existing entry is newer break; @@ -616,14 +703,14 @@ PT_Element PT_ReadIndex(PT_Indexes indexes, const char* url, int flags) { if (indexes != NULL) { - long int index_id; + intptr_t index_id; if (strncmp(url, "http://", 7) == 0) url += 7; if (inthash_read(indexes->cil, url, &index_id)) { if (index_id >= 0 && index_id <= indexes->index_size) { PT_Element item = PT_ReadCache(indexes->index[index_id], url, flags); if (item != NULL) { - item->indexId = index_id; + item->indexId = (int) index_id; return item; } } else { @@ -637,7 +724,7 @@ PT_Element PT_ReadIndex(PT_Indexes indexes, const char* url, int flags) int PT_LookupIndex(PT_Indexes indexes, const char* url) { if (indexes != NULL) { - long int index_id; + intptr_t index_id; if (strncmp(url, "http://", 7) == 0) url += 7; if (inthash_read(indexes->cil, url, &index_id)) { @@ -651,6 +738,22 @@ int PT_LookupIndex(PT_Indexes indexes, const char* url) { return 0; } +time_t PT_GetTimeIndex(PT_Indexes indexes) { + if (indexes != NULL && indexes->index_size > 0) + { + int i; + time_t maxt = indexes->index[0]->slots.common.timestamp; + for(i = 1 ; i < indexes->index_size ; i++) { + const time_t currt = indexes->index[i]->slots.common.timestamp; + if (currt > maxt) { + maxt = currt; + } + } + return maxt; + } + return (time_t) -1; +} + PT_Index PT_GetIndex(PT_Indexes indexes, int indexId) { if (indexes != NULL && indexId >= 0 && indexId < indexes->index_size) { @@ -659,7 +762,7 @@ PT_Index PT_GetIndex(PT_Indexes indexes, int indexId) { return NULL; } -PT_Element PT_ElementNew() { +PT_Element PT_ElementNew(void) { PT_Element r = NULL; if ((r = calloc(sizeof(_PT_Element), 1)) == NULL) return NULL; @@ -690,6 +793,22 @@ static PT_Element PT_ReadCache__New(PT_Index index, const char* url, int flags) /* New HTTrack cache (new.zip) format */ /* ------------------------------------------------------------ */ +#define ZIP_FIELD_STRING(headers, headersSize, field, value) do { \ + if ( (value != NULL) && (value)[0] != '\0') { \ + sprintf(headers + headersSize, "%s: %s\r\n", field, (value != NULL) ? (value) : ""); \ + (headersSize) += (int) strlen(headers + headersSize); \ + } \ +} while(0) +#define ZIP_FIELD_INT(headers, headersSize, field, value) do { \ + if ( (value != 0) ) { \ + sprintf(headers + headersSize, "%s: "LLintP"\r\n", field, (LLint)(value)); \ + (headersSize) += (int) strlen(headers + headersSize); \ + } \ +} while(0) +#define ZIP_FIELD_INT_FORCE(headers, headersSize, field, value) do { \ + sprintf(headers + headersSize, "%s: "LLintP"\r\n", field, (LLint)(value)); \ + (headersSize) += (int) strlen(headers + headersSize); \ +} while(0) #define ZIP_READFIELD_STRING(line, value, refline, refvalue) do { \ if (line[0] != '\0' && strfield2(line, refline)) { \ strcpy(refvalue, value); \ @@ -821,7 +940,8 @@ static PT_Element PT_ReadCache__New_u(PT_Index index_, const char* url, int flag char location_default[HTS_URLMAXSIZE*2]; char previous_save[HTS_URLMAXSIZE*2]; char previous_save_[HTS_URLMAXSIZE*2]; - long int hash_pos; + char catbuff[CATBUFF_SIZE]; + intptr_t hash_pos; int hash_pos_return; PT_Element r = NULL; if (index == NULL || index->hash == NULL || index->zFile == NULL || url == NULL || *url == 0) @@ -835,7 +955,7 @@ static PT_Element PT_ReadCache__New_u(PT_Index index_, const char* url, int flag strcpy(r->location, ""); if (strncmp(url, "http://", 7) == 0) url += 7; - hash_pos_return = inthash_read(index->hash, url, (long int*)&hash_pos); + hash_pos_return = inthash_read(index->hash, url, &hash_pos); if (hash_pos_return) { uLong posInZip; @@ -888,6 +1008,16 @@ static PT_Element PT_ReadCache__New_u(PT_Index index_, const char* url, int flag //ZIP_READFIELD_STRING(line, value, "X-Addr", ..); // Original address //ZIP_READFIELD_STRING(line, value, "X-Fil", ..); // Original URI filename ZIP_READFIELD_STRING(line, value, "X-Save", previous_save_); // Original save filename + if (line[0] != '\0') { + int len = r->headers ? ((int) strlen(r->headers)) : 0; + int nlen = (int) ( strlen(line) + 2 + strlen(value) + sizeof("\r\n") + 1 ); + r->headers = realloc(r->headers, len + nlen); + r->headers[len] = '\0'; + strcat(r->headers, line); + strcat(r->headers, ": "); + strcat(r->headers, value); + strcat(r->headers, "\r\n"); + } } } while(offset < readSizeHeader && !lineEof); totalHeader = offset; @@ -955,13 +1085,14 @@ static PT_Element PT_ReadCache__New_u(PT_Index index_, const char* url, int flag /* Read in memory from cache */ if (flags & FETCH_BODY) { if (strnotempty(previous_save)) { - FILE* fp = fopen(fconv(previous_save), "rb"); + FILE* fp = fopen(fconv(catbuff,previous_save), "rb"); if (fp != NULL) { r->adr = (char*) malloc(r->size + 4); if (r->adr != NULL) { if (r->size > 0 && fread(r->adr, 1, r->size, fp) != r->size) { + int last_errno = errno; r->statuscode=STATUSCODE_INVALID; - sprintf(r->msg,"Read error in cache disk data: %s", strerror(errno)); + sprintf(r->msg,"Read error in cache disk data: %s", strerror(last_errno)); } } else { r->statuscode=STATUSCODE_INVALID; @@ -970,7 +1101,7 @@ static PT_Element PT_ReadCache__New_u(PT_Index index_, const char* url, int flag fclose(fp); } else { r->statuscode=STATUSCODE_INVALID; - sprintf(r->msg, "Read error (can't open '%s') from cache", fconv(previous_save)); + sprintf(r->msg, "Read error (can't open '%s') from cache", fconv(catbuff,previous_save)); } } else { r->statuscode=STATUSCODE_INVALID; @@ -982,7 +1113,7 @@ static PT_Element PT_ReadCache__New_u(PT_Index index_, const char* url, int flag if (flags & FETCH_BODY) { r->adr=(char*) malloc(r->size+1); if (r->adr!=NULL) { - if (unzReadCurrentFile(index->zFile, r->adr, r->size) != r->size) { // erreur + if (unzReadCurrentFile(index->zFile, r->adr, (unsigned int) r->size) != r->size) { // erreur free(r->adr); r->adr=NULL; r->statuscode=STATUSCODE_INVALID; @@ -1024,6 +1155,121 @@ static PT_Element PT_ReadCache__New_u(PT_Index index_, const char* url, int flag return r; } +static int PT_SaveCache__New_Fun(void *arg, const char *url, PT_Element element) { + zipFile zFileOut = (zipFile) arg; + char headers[8192]; + int headersSize; + zip_fileinfo fi; + int zErr; + const char *url_adr = ""; + const char *url_fil = ""; + + headers[0] = '\0'; + headersSize = 0; + + /* Fields */ + headers[0] = '\0'; + headersSize = 0; + /* */ + { + char* message; + if (strlen(element->msg) < 32) { + message = element->msg; + } else { + message = "(See X-StatusMessage)"; + } + /* 64 characters MAX for first line */ + sprintf(headers + headersSize, "HTTP/1.%c %d %s\r\n", '1', element->statuscode, element->msg); + } + headersSize += (int) strlen(headers + headersSize); + + /* Second line MUST ALWAYS be X-In-Cache */ + ZIP_FIELD_INT_FORCE(headers, headersSize, "X-In-Cache", 1); + ZIP_FIELD_INT(headers, headersSize, "X-StatusCode", element->statuscode); + ZIP_FIELD_STRING(headers, headersSize, "X-StatusMessage", element->msg); + ZIP_FIELD_INT(headers, headersSize, "X-Size", element->size); // size + ZIP_FIELD_STRING(headers, headersSize, "Content-Type", element->contenttype); // contenttype + ZIP_FIELD_STRING(headers, headersSize, "X-Charset", element->charset); // contenttype + ZIP_FIELD_STRING(headers, headersSize, "Last-Modified", element->lastmodified); // last-modified + ZIP_FIELD_STRING(headers, headersSize, "Etag", element->etag); // Etag + ZIP_FIELD_STRING(headers, headersSize, "Location", element->location); // 'location' pour moved + ZIP_FIELD_STRING(headers, headersSize, "Content-Disposition", element->cdispo); // Content-disposition + ZIP_FIELD_STRING(headers, headersSize, "X-Addr", url_adr); // Original address + ZIP_FIELD_STRING(headers, headersSize, "X-Fil", url_fil); // Original URI filename + ZIP_FIELD_STRING(headers, headersSize, "X-Save", ""); // Original save filename + + /* Time */ + memset(&fi, 0, sizeof(fi)); + if (element->lastmodified[0] != '\0') { + struct tm buffer; + struct tm* tm_s = convert_time_rfc822(&buffer, element->lastmodified); + if (tm_s) { + fi.tmz_date.tm_sec = (uInt) tm_s->tm_sec; + fi.tmz_date.tm_min = (uInt) tm_s->tm_min; + fi.tmz_date.tm_hour = (uInt) tm_s->tm_hour; + fi.tmz_date.tm_mday = (uInt) tm_s->tm_mday; + fi.tmz_date.tm_mon = (uInt) tm_s->tm_mon; + fi.tmz_date.tm_year = (uInt) tm_s->tm_year; + } + } + + /* Open file - NOTE: headers in "comment" */ + if ((zErr = zipOpenNewFileInZip(zFileOut, + url, + &fi, + /* + Store headers in realtime in the local file directory as extra field + In case of crash, we'll be able to recover the whole ZIP file by rescanning it + */ + headers, + (uInt) strlen(headers), + NULL, + 0, + NULL, /* comment */ + Z_DEFLATED, + Z_DEFAULT_COMPRESSION)) != Z_OK) + { + int zip_zipOpenNewFileInZip_failed = 0; + assertf(zip_zipOpenNewFileInZip_failed); + } + + /* Write data in cache */ + if (element->size > 0 && element->adr != NULL) { + if ((zErr = zipWriteInFileInZip(zFileOut, element->adr, (int) element->size)) != Z_OK) { + int zip_zipWriteInFileInZip_failed = 0; + assertf(zip_zipWriteInFileInZip_failed); + } + } + + /* Close */ + if ((zErr = zipCloseFileInZip(zFileOut)) != Z_OK) { + int zip_zipCloseFileInZip_failed = 0; + assertf(zip_zipCloseFileInZip_failed); + } + + /* Flush */ + if ((zErr = zipFlush(zFileOut)) != 0) { + int zip_zipFlush_failed = 0; + assertf(zip_zipFlush_failed); + } + + return 0; +} + +static int PT_SaveCache__New(PT_Indexes indexes, const char *filename) { + zipFile zFileOut = zipOpen(filename, 0); + if (zFileOut != NULL) { + int ret = PT_EnumCache(indexes, PT_SaveCache__New_Fun, (void *) zFileOut); + zipClose(zFileOut, "Created by HTTrack Website Copier/ProxyTrack "PROXYTRACK_VERSION); + zFileOut = NULL; + if (ret != 0) + (void) unlink(filename); + return ret; + } + return -1; +} + + /* ------------------------------------------------------------ */ /* Old HTTrack cache (dat/ndx) format */ @@ -1167,7 +1413,7 @@ static int PT_LoadCache__Old(PT_Index index_, const char *filename) { /* */ } else { // Vieille version du cache /* */ - // fspc(opt->log,"warning"); fprintf(opt->log,"Cache: importing old cache format"LF); + // HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Cache: importing old cache format"LF); cache->version=0; // cache 1.0 strcpy(cache->lastmodified,firstline); } @@ -1257,7 +1503,7 @@ static PT_Element PT_ReadCache__Old(PT_Index index, const char* url, int flags) static PT_Element PT_ReadCache__Old_u(PT_Index index_, const char* url, int flags) { PT_Index__Old cache = (PT_Index__Old) &index_->slots.formatOld; - long int hash_pos; + intptr_t hash_pos; int hash_pos_return; char location_default[HTS_URLMAXSIZE*2]; char previous_save[HTS_URLMAXSIZE*2]; @@ -1276,7 +1522,7 @@ static PT_Element PT_ReadCache__Old_u(PT_Index index_, const char* url, int flag strcpy(r->location, ""); if (strncmp(url, "http://", 7) == 0) url += 7; - hash_pos_return=inthash_read(cache->hash, url, (long int*)&hash_pos); + hash_pos_return=inthash_read(cache->hash, url, &hash_pos); if (hash_pos_return) { int pos = (int) hash_pos; /* simply */ @@ -1325,10 +1571,12 @@ static PT_Element PT_ReadCache__Old_u(PT_Index index_, const char* url, int flag } else { char check[256]; unsigned long size_read; + unsigned long int size_; check[0]='\0'; // cache_rint(cache->dat,&r->statuscode); - cache_rLLint(cache->dat,&r->size); + cache_rLLint(cache->dat,&size_); + r->size = (size_t) size_; cache_rstr(cache->dat,r->msg); cache_rstr(cache->dat,r->contenttype); if (cache->version >= 3) @@ -1503,3 +1751,497 @@ static int PT_LookupCache__Old_u(PT_Index index_, const char* url) { return 0; } + +/* ------------------------------------------------------------ */ +/* Internet Archive Arc 1.0 (arc) format */ +/* Xavier Roche (roche@httrack.com) */ +/* Lars Clausen (lc@statsbiblioteket.dk) */ +/* ------------------------------------------------------------ */ + +#define ARC_SP ' ' + +static const char* getArcField(const char *line, int pos) { + int i; + for(i = 0 ; line[i] != '\0' && pos > 0 ; i++) { + if (line[i] == ARC_SP) + pos--; + } + if (pos == 0) + return &line[i]; + return NULL; +} + +static char* copyArcField(const char *line, int npos, char *dest, int destMax) { + const char *pos; + if ((pos = getArcField(line, npos)) != NULL) { + int i; + for(i = 0 ; pos[i] != '\0' && pos[i] != ARC_SP && ( --destMax ) > 0; i++) { + dest[i] = pos[i]; + } + dest[i] = 0; + return dest; + } + dest[0] = 0; + return NULL; +} + +static int getArcLength(const char *line) { + const char *pos; + if ((pos = getArcField(line, 9)) != NULL + || (pos = getArcField(line, 4)) != NULL + || (pos = getArcField(line, 2)) != NULL + ) { + int length; + if (sscanf(pos, "%d", &length) == 1) { + return length; + } + } + return -1; +} + +static int skipArcNl(FILE* file) { + if (fgetc(file) == 0x0a) { + return 0; + } + return -1; +} + +static int skipArcData(FILE* file, const char *line) { + int jump = getArcLength(line); + if (jump != -1) { + if (fseek(file, jump, SEEK_CUR) == 0 /* && skipArcNl(file) == 0 */) { + return 0; + } + } + return -1; +} + +static int getDigit(const char digit) { + return (int) ( digit - '0' ); +} + +static int getDigit2(const char * const pos) { + return getDigit(pos[0])*10 + getDigit(pos[1]); +} + +static int getDigit4(const char * const pos) { + return getDigit(pos[0])*1000 + getDigit(pos[1])*100 + getDigit(pos[2])*10 + getDigit(pos[3]); +} + +static time_t getGMT(struct tm *tm) { /* hey, time_t is local! */ + time_t t = mktime(tm); + if (t != (time_t) -1 && t != (time_t) 0) { + /* BSD does not have static "timezone" declared */ +#if (defined(BSD) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD_kernel__)) + time_t now = time(NULL); + time_t timezone = - localtime(&now)->tm_gmtoff; +#endif + return (time_t) (t - timezone); + } + return (time_t) -1; +} + +static time_t getArcTimestamp(const char * const line) { + const char *pos; + if ((pos = getArcField(line, 2)) != NULL) { + int i; + /* date == YYYYMMDDhhmmss (Greenwich Mean Time) */ + /* example: 20050405154029 */ + for(i = 0 ; pos[i] >= '0' && pos[i] <= '9' ; i++); + if (i == 14) { + struct tm tm; + memset(&tm, 0, sizeof(tm)); + tm.tm_year = getDigit4(pos + 0) - 1900; /* current year minus 1900 */ + tm.tm_mon = getDigit2(pos + 4) - 1; /* 0 – 11 */ + tm.tm_mday = getDigit2(pos + 6); /* 1 – 31 */ + tm.tm_hour = getDigit2(pos + 8); /* 0 – 23 */ + tm.tm_min = getDigit2(pos + 10); /* 0 – 59 */ + tm.tm_sec = getDigit2(pos + 12); /* 0 – 59 */ + tm.tm_isdst = 0; + return getGMT(&tm); + } + } + return (time_t) -1; +} + +static int readArcURLRecord(PT_Index__Arc index) { + index->line[0] = '\0'; + if (linput(index->file, index->line, sizeof(index->line) - 1)) { + return 0; + } + return -1; +} + +#define str_begins(str, sstr) ( strncmp(str, sstr, sizeof(sstr) - 1) == 0 ) +static int PT_CompatibleScheme(const char *url) { + return (str_begins(url, "http:") + || str_begins(url, "https:") + || str_begins(url, "ftp:") + || str_begins(url, "file:")); +} + +int PT_LoadCache__Arc(PT_Index index_, const char *filename) { + if (index_ != NULL && filename != NULL) { + PT_Index__Arc index = &index_->slots.formatArc; + index->timestamp = file_timestamp(filename); + MutexInit(&index->fileLock); + index->file = fopen(filename, "rb"); + + // Opened ? + if (index->file != NULL) { + inthash hashtable = index->hash; + if (readArcURLRecord(index) == 0) { + int entries = 0; + /* Read first line */ + if (strncmp(index->line, "filedesc://", sizeof("filedesc://") - 1) != 0) { + fprintf(stderr, "Unexpected bad signature #%s"LF, index->line); + fclose(index->file); + index->file = NULL; + return 0; + } + /* Timestamp */ + index->timestamp = getArcTimestamp(index->line); + /* Skip first entry */ + if (skipArcData(index->file, index->line) != 0 || skipArcNl(index->file) != 0) { + fprintf(stderr, "Unexpected bad data offset size first entry"LF); + fclose(index->file); + index->file = NULL; + return 0; + } + /* Read all meta-entries (not data) */ + while(!feof(index->file)) { + unsigned long int fpos = ftell(index->file); + if (skipArcNl(index->file) == 0 && readArcURLRecord(index) == 0) { + int length = getArcLength(index->line); + if (length >= 0) { + const char * filenameIndex = copyArcField(index->line, 0, + index->filenameIndexBuff, sizeof(index->filenameIndexBuff) - 1); /* can not be NULL */ + if (strncmp(filenameIndex, "http://", 7) == 0) { + filenameIndex += 7; + } + if (*filenameIndex != 0) { + if (skipArcData(index->file, index->line) != 0) { + fprintf(stderr, "Corrupted cache data entry #%d (truncated file?), aborting read"LF, (int)entries); + } + /*fprintf(stdout, "adding %s [%d]\n", filenameIndex, (int)fpos);*/ + if (PT_CompatibleScheme(index->filenameIndexBuff)) { + inthash_add(hashtable, filenameIndex, fpos); /* position of meta-data */ + entries++; + } + } else { + fprintf(stderr, "Corrupted cache meta entry #%d"LF, (int)entries); + } + } else { + fprintf(stderr, "Corrupted cache meta entry #%d, aborting read"LF, (int)entries); + break ; + } + } else { + break ; + } + } + + /* OK */ + return 1; + } else { + fprintf(stderr, "Bad file (empty ?)"LF); + } + } else { + fprintf(stderr, "Unable to open file"LF); + index = NULL; + } + } else { + fprintf(stderr, "Bad arguments"LF); + } + return 0; +} + +#define HTTP_READFIELD_STRING(line, value, refline, refvalue) do { \ + if (line[0] != '\0' && strfield2(line, refline)) { \ + strcpy(refvalue, value); \ + line[0] = '\0'; \ + } \ +} while(0) +#define HTTP_READFIELD_INT(line, value, refline, refvalue) do { \ + if (line[0] != '\0' && strfield2(line, refline)) { \ + int intval = 0; \ + sscanf(value, "%d", &intval); \ + (refvalue) = intval; \ + line[0] = '\0'; \ + } \ +} while(0) + +static PT_Element PT_ReadCache__Arc(PT_Index index, const char* url, int flags) { + PT_Element retCode; + MutexLock(&index->slots.formatArc.fileLock); + { + retCode = PT_ReadCache__Arc_u(index, url, flags); + } + MutexUnlock(&index->slots.formatArc.fileLock); + return retCode; +} + +static PT_Element PT_ReadCache__Arc_u(PT_Index index_, const char* url, int flags) +{ + PT_Index__Arc index = (PT_Index__Arc) &index_->slots.formatArc; + char location_default[HTS_URLMAXSIZE*2]; + intptr_t hash_pos; + int hash_pos_return; + PT_Element r = NULL; + if (index == NULL || index->hash == NULL || url == NULL || *url == 0) + return NULL; + if ((r = PT_ElementNew()) == NULL) + return NULL; + location_default[0] = '\0'; + memset(r, 0, sizeof(_PT_Element)); + r->location = location_default; + strcpy(r->location, ""); + if (strncmp(url, "http://", 7) == 0) + url += 7; + hash_pos_return = inthash_read(index->hash, url, &hash_pos); + + if (hash_pos_return) { + if (fseek(index->file, (long)hash_pos, SEEK_SET) == 0) { + if (skipArcNl(index->file) == 0 && readArcURLRecord(index) == 0) { + long int fposMeta = ftell(index->file); + int dataLength = getArcLength(index->line); + const char *pos; + + /* Read HTTP headers */ + /* HTTP/1.1 404 Not Found */ + if (linput(index->file, index->line, sizeof(index->line) - 1)) { + if ((pos = getArcField(index->line, 1)) != NULL) { + if (sscanf(pos, "%d", &r->statuscode) != 1) { + r->statuscode = STATUSCODE_INVALID; + } + } + if ((pos = getArcField(index->line, 2)) != NULL) { + r->msg[0] = '\0'; + strncat(r->msg, pos, sizeof(pos) - 1); + } + while (linput(index->file, index->line, sizeof(index->line) - 1) && index->line[0] != '\0') { + char* const line = index->line; + char* value = strchr(line, ':'); + if (value != NULL) { + *value = '\0'; + for( value++ ; *value == ' ' || *value == '\t' ; value++); + HTTP_READFIELD_INT(line, value, "Content-Length", r->size); // size + HTTP_READFIELD_STRING(line, value, "Content-Type", r->contenttype); // contenttype + HTTP_READFIELD_STRING(line, value, "Last-Modified", r->lastmodified); // last-modified + HTTP_READFIELD_STRING(line, value, "Etag", r->etag); // Etag + HTTP_READFIELD_STRING(line, value, "Location", r->location); // 'location' pour moved + HTTP_READFIELD_STRING(line, value, "Content-Disposition", r->cdispo); // Content-disposition + if (line[0] != '\0') { + int len = r->headers ? ((int) strlen(r->headers)) : 0; + int nlen = (int) ( strlen(line) + 2 + strlen(value) + sizeof("\r\n") + 1 ); + r->headers = realloc(r->headers, len + nlen); + r->headers[len] = '\0'; + strcat(r->headers, line); + strcat(r->headers, ": "); + strcat(r->headers, value); + strcat(r->headers, "\r\n"); + } + } + } + + /* FIXME charset */ + if (r->contenttype[0] != '\0') { + char *pos = strchr(r->contenttype, ';'); + if (pos != NULL) { + /*char *chs = strchr(pos, "charset=");*/ + /*HTTP_READFIELD_STRING(line, value, "X-Charset", r->charset);*/ + *pos = 0; + if ((pos = strchr(r->contenttype, ' ')) != NULL) { + *pos = 0; + } + } + } + + /* Read data */ + if (r->statuscode != STATUSCODE_INVALID) { /* Can continue */ + if (flags & FETCH_BODY) { + long int fposCurrent = ftell(index->file); + long int metaSize = fposCurrent - fposMeta; + long int fetchSize = (long int) r->size; + if (fetchSize <= 0) { + fetchSize = dataLength - metaSize; + } else if (fetchSize > dataLength - metaSize) { + r->statuscode=STATUSCODE_INVALID; + strcpy(r->msg, "Cache Read Error : Truncated Data"); + } + r->size = 0; + if (r->statuscode != STATUSCODE_INVALID) { + r->adr = (char*) malloc(fetchSize); + if (r->adr != NULL) { + if (fetchSize > 0 && ( r->size = (int) fread(r->adr, 1, fetchSize, index->file) ) != fetchSize) { + int last_errno = errno; + r->statuscode=STATUSCODE_INVALID; + sprintf(r->msg,"Read error in cache disk data: %s", strerror(last_errno)); + } + } else { + r->statuscode=STATUSCODE_INVALID; + strcpy(r->msg,"Read error (memory exhausted) from cache"); + } + } + } + } + + } else { + r->statuscode=STATUSCODE_INVALID; + strcpy(r->msg, "Cache Read Error : Read Header Error"); + } + + } else { + r->statuscode=STATUSCODE_INVALID; + strcpy(r->msg, "Cache Read Error : Read Header Error"); + } + } else { + r->statuscode=STATUSCODE_INVALID; + strcpy(r->msg, "Cache Read Error : Seek Error"); + } + + } else { + r->statuscode=STATUSCODE_INVALID; + strcpy(r->msg,"File Cache Entry Not Found"); + } + if (r->location[0] != '\0') { + r->location = strdup(r->location); + } else { + r->location = NULL; + } + return r; +} + +static int PT_LookupCache__Arc(PT_Index index, const char* url) { + int retCode; + MutexLock(&index->slots.formatArc.fileLock); + { + retCode = PT_LookupCache__Arc_u(index, url); + } + MutexUnlock(&index->slots.formatArc.fileLock); + return retCode; +} + +static int PT_LookupCache__Arc_u(PT_Index index_, const char* url) { + if (index_ != NULL) { + PT_Index__New cache = (PT_Index__New) &index_->slots.formatNew; + if (cache == NULL || cache->hash == NULL || url == NULL || *url == 0) + return 0; + if (strncmp(url, "http://", 7) == 0) + url += 7; + if (inthash_read(cache->hash, url, NULL)) + return 1; + } + return 0; +} + +typedef struct PT_SaveCache__Arc_t { + PT_Indexes indexes; + FILE *fp; + time_t t; + char filename[64]; + struct tm buff; + char headers[8192]; + char md5[32 + 2]; +} PT_SaveCache__Arc_t; + +static int PT_SaveCache__Arc_Fun(void *arg, const char *url, PT_Element element) { + PT_SaveCache__Arc_t *st = (PT_SaveCache__Arc_t*) arg; + FILE * const fp = st->fp; + struct tm* tm = convert_time_rfc822(&st->buff, element->lastmodified); + int size_headers; + + sprintf(st->headers, + "HTTP/1.0 %d %s" "\r\n" + "X-Server: ProxyTrack " PROXYTRACK_VERSION "\r\n" + "Content-type: %s%s%s%s" "\r\n" + "Last-modified: %s" "\r\n" + "Content-length: %d" "\r\n" + , + element->statuscode, element->msg, + /**/ + element->contenttype, + (element->charset[0] ? "; charset=\"" : ""), + (element->charset[0] ? element->charset : ""), + (element->charset[0] ? "\"" : ""), + /**/ + element->lastmodified, + (int) element->size + ); + if (element->location != NULL && element->location[0] != '\0') { + sprintf(st->headers + strlen(st->headers), "Location: %s" "\r\n", element->location); + } + if (element->headers != NULL) { + if ( strlen(element->headers) < sizeof(st->headers) - strlen(element->headers) - 1 ) { + strcat(st->headers, element->headers); + } + } + strcat(st->headers, "\r\n"); + size_headers = (int) strlen(st->headers); + + /* doc == <nl><URL-record><nl><network_doc> */ + + /* Format: URL IP date mime result checksum location offset filename length */ + if (element->adr != NULL) { + domd5mem(element->adr, element->size, st->md5, 1); + } else { + strcpy(st->md5, "-"); + } + fprintf(fp, + /* nl */ + "\n" + /* URL-record */ + "%s%s %s %04d%02d%02d%02d%02d%02d %s %d %s %s %ld %s %ld" + /* nl */ + "\n", + /* args */ + ( link_has_authority(url) ? "" : "http://" ), url, + "0.0.0.0", + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec, + element->contenttype, + element->statuscode, + st->md5, ( element->location ? element->location : "-" ), + (long int)ftell(fp), st->filename, + (long int)( size_headers + element->size )); + /* network_doc */ + if (fwrite(st->headers, 1, size_headers, fp) != size_headers + || ( element->size > 0 && fwrite(element->adr, 1, element->size, fp) != element->size ) + ) { + return 1; /* Error */ + } + + return 0; +} + +static int PT_SaveCache__Arc(PT_Indexes indexes, const char *filename) { + FILE *fp = fopen(filename, "wb"); + if (fp != NULL) { + PT_SaveCache__Arc_t st; + int ret; + time_t t = PT_GetTimeIndex(indexes); + struct tm tm = PT_GetTime(t); + + /* version-2-block == + filedesc://<path><sp><ip_address><sp><date><sp>text/plain<sp>200<sp>-<sp>-<sp>0<sp><filename><sp><length><nl> + 2<sp><reserved><sp><origin-code><nl> + URL<sp>IP-address<sp>Archive-date<sp>Content-type<sp>Result-code<sp>Checksum<sp>Location<sp> Offset<sp>Filename<sp>Archive-length<nl> + <nl> */ + const char* prefix = + "2 0 HTTrack Website Copier" "\n" + "URL IP-address Archive-Date Content-Type Result-code Checksum Location Offset Filename Archive-length" "\n" "\n"; + sprintf(st.filename, "httrack_%d.arc", (int) t); + fprintf(fp, "filedesc://%s 0.0.0.0 %04d%02d%02d%02d%02d%02d text/plain 200 - - 0 %s %d" "\n" + "%s", + st.filename, + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, + st.filename, (int)strlen(prefix), prefix); + st.fp = fp; + st.indexes = indexes; + st.t = t; + ret = PT_EnumCache(indexes, PT_SaveCache__Arc_Fun, (void *)&st); + fclose(fp); + if (ret != 0) + (void) unlink(filename); + return ret; + } + return -1; +} |