From 844ecc37072d515513177c65a8c9dc35c9cdfc1a Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Mon, 19 Mar 2012 12:55:42 +0000 Subject: httrack 3.33.16 --- src/Makefile.am | 12 +- src/Makefile.in | 159 +- src/hts-indextmpl.h | 19 +- src/htsalias.c | 26 +- src/htsalias.h | 3 + src/htsback.c | 651 +++--- src/htsback.h | 19 +- src/htsbase.h | 62 +- src/htsbasenet.h | 31 +- src/htsbauth.c | 51 +- src/htsbauth.h | 7 +- src/htscache.c | 864 +++++++- src/htscache.h | 9 +- src/htscatchurl.c | 15 +- src/htscatchurl.h | 5 + src/htscore.c | 246 ++- src/htscore.h | 63 +- src/htscoremain.c | 365 ++-- src/htscoremain.h | 4 +- src/htsdefines.h | 9 + src/htsfilters.c | 6 +- src/htsfilters.h | 3 + src/htsftp.c | 139 +- src/htsftp.h | 6 +- src/htsglobal.h | 109 +- src/htshash.c | 10 +- src/htshash.h | 3 + src/htshelp.c | 40 +- src/htshelp.h | 3 + src/htsindex.c | 9 +- src/htsindex.h | 3 + src/htsinthash.c | 23 +- src/htsinthash.h | 7 +- src/htsjava.c | 11 +- src/htsjava.h | 4 +- src/htslib.c | 708 ++++--- src/htslib.h | 214 +- src/htsmd5.c | 25 +- src/htsmd5.h | 3 + src/htsmodules.c | 170 +- src/htsmodules.h | 10 +- src/htsname.c | 213 +- src/htsname.h | 3 + src/htsnet.h | 14 +- src/htsnostatic.c | 5 +- src/htsnostatic.h | 75 +- src/htsopt.h | 31 +- src/htsparse.c | 4682 +++++++++++++++++++++++--------------------- src/htsparse.h | 7 +- src/htsrobots.c | 5 +- src/htsrobots.h | 5 +- src/htsserver.c | 584 +++--- src/htsserver.h | 237 ++- src/htsstrings.h | 138 ++ src/htsthread.c | 160 +- src/htsthread.h | 27 +- src/htstools.c | 298 ++- src/htstools.h | 55 +- src/htsweb.c | 34 +- src/htsweb.h | 14 +- src/htswizard.c | 45 +- src/htswizard.h | 6 + src/htswrap.c | 3 + src/htswrap.h | 3 +- src/htszlib.c | 68 +- src/htszlib.h | 36 +- src/httrack-library.h | 8 + src/httrack.c | 90 +- src/httrack.h | 237 ++- src/md5.h | 19 +- src/minizip/ChangeLogUnzip | 55 + src/minizip/crypt.h | 132 ++ src/minizip/ioapi.c | 196 ++ src/minizip/ioapi.h | 78 + src/minizip/iowin32.c | 275 +++ src/minizip/iowin32.h | 21 + src/minizip/mztools.c | 287 +++ src/minizip/mztools.h | 31 + src/minizip/unzip.c | 1591 +++++++++++++++ src/minizip/unzip.h | 352 ++++ src/minizip/zip.c | 1199 ++++++++++++ src/minizip/zip.h | 239 +++ src/webhttrack | 89 +- src/webhttrack.dsp | 2 +- 84 files changed, 11712 insertions(+), 4063 deletions(-) create mode 100755 src/htsstrings.h create mode 100755 src/minizip/ChangeLogUnzip create mode 100644 src/minizip/crypt.h create mode 100644 src/minizip/ioapi.c create mode 100644 src/minizip/ioapi.h create mode 100644 src/minizip/iowin32.c create mode 100644 src/minizip/iowin32.h create mode 100644 src/minizip/mztools.c create mode 100644 src/minizip/mztools.h create mode 100644 src/minizip/unzip.c create mode 100644 src/minizip/unzip.h create mode 100644 src/minizip/zip.c create mode 100644 src/minizip/zip.h (limited to 'src') diff --git a/src/Makefile.am b/src/Makefile.am index c391d3f..bc74182 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -42,6 +42,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ htsmd5.c htszlib.c htsnostatic.c htswrap.c \ htsmodules.c \ md5.c \ + minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \ hts-indextmpl.h htsalias.h htsback.h htsbase.h \ htsbasenet.h htsbauth.h htscache.h htscatchurl.h \ htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \ @@ -50,11 +51,16 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ htsmodules.h htsname.h htsnet.h htsnostatic.h \ htsopt.h htsrobots.h htssystem.h htsthread.h \ htstools.h htswizard.h htswrap.h htszlib.h \ - httrack-library.h md5.h + htsstrings.h httrack-library.h \ + md5.h \ + minizip/crypt.h minizip/ioapi.h minizip/mztools.h minizip/unzip.h minizip/zip.h + -libhttrack_la_LIBADD = $(THREADS_LIBS) $(DL_LIBS) $(SOCKET_LIBS) +libhttrack_la_LIBADD = $(THREADS_LIBS) $(ZLIB_LIBS) $(DL_LIBS) $(SOCKET_LIBS) libhttrack_la_LDFLAGS = -version-info $(VERSION_INFO) EXTRA_DIST = httrack.h webhttrack \ httrack.dsp httrack.dsw \ - webhttrack.dsp webhttrack.dsw + webhttrack.dsp webhttrack.dsw \ + minizip/ChangeLogUnzip minizip/iowin32.c minizip/iowin32.h + diff --git a/src/Makefile.in b/src/Makefile.in index 22590c6..aa5da2a 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -71,6 +71,7 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LDFLAGS = @LDFLAGS@ LFS_FLAG = @LFS_FLAG@ +LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ LN_S = @LN_S@ @@ -179,6 +180,7 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ htsmd5.c htszlib.c htsnostatic.c htswrap.c \ htsmodules.c \ md5.c \ + minizip/ioapi.c minizip/mztools.c minizip/unzip.c minizip/zip.c \ hts-indextmpl.h htsalias.h htsback.h htsbase.h \ htsbasenet.h htsbauth.h htscache.h htscatchurl.h \ htsconfig.h htscore.h htsparse.h htscoremain.h htsdefines.h \ @@ -187,15 +189,18 @@ libhttrack_la_SOURCES = htscore.c htsparse.c htsback.c htscache.c \ htsmodules.h htsname.h htsnet.h htsnostatic.h \ htsopt.h htsrobots.h htssystem.h htsthread.h \ htstools.h htswizard.h htswrap.h htszlib.h \ - httrack-library.h md5.h + htsstrings.h httrack-library.h \ + md5.h \ + minizip/crypt.h minizip/ioapi.h minizip/mztools.h minizip/unzip.h minizip/zip.h -libhttrack_la_LIBADD = $(THREADS_LIBS) $(DL_LIBS) $(SOCKET_LIBS) +libhttrack_la_LIBADD = $(THREADS_LIBS) $(ZLIB_LIBS) $(DL_LIBS) $(SOCKET_LIBS) libhttrack_la_LDFLAGS = -version-info $(VERSION_INFO) EXTRA_DIST = httrack.h webhttrack \ httrack.dsp httrack.dsw \ - webhttrack.dsp webhttrack.dsw + webhttrack.dsp webhttrack.dsw \ + minizip/ChangeLogUnzip minizip/iowin32.c minizip/iowin32.h subdir = src mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs @@ -209,7 +214,8 @@ am_libhttrack_la_OBJECTS = htscore.lo htsparse.lo htsback.lo htscache.lo \ htshelp.lo htsjava.lo htslib.lo htscoremain.lo htsname.lo \ htsrobots.lo htstools.lo htswizard.lo htsalias.lo htsthread.lo \ htsindex.lo htsbauth.lo htsmd5.lo htszlib.lo htsnostatic.lo \ - htswrap.lo htsmodules.lo md5.lo + htswrap.lo htsmodules.lo md5.lo ioapi.lo mztools.lo unzip.lo \ + zip.lo libhttrack_la_OBJECTS = $(am_libhttrack_la_OBJECTS) bin_PROGRAMS = httrack$(EXEEXT) htsserver$(EXEEXT) PROGRAMS = $(bin_PROGRAMS) @@ -243,7 +249,9 @@ am__depfiles_maybe = depfiles @AMDEP_TRUE@ ./$(DEPDIR)/htstools.Plo ./$(DEPDIR)/htsweb.Po \ @AMDEP_TRUE@ ./$(DEPDIR)/htswizard.Plo ./$(DEPDIR)/htswrap.Plo \ @AMDEP_TRUE@ ./$(DEPDIR)/htszlib.Plo ./$(DEPDIR)/httrack.Po \ -@AMDEP_TRUE@ ./$(DEPDIR)/md5.Plo +@AMDEP_TRUE@ ./$(DEPDIR)/ioapi.Plo ./$(DEPDIR)/md5.Plo \ +@AMDEP_TRUE@ ./$(DEPDIR)/mztools.Plo ./$(DEPDIR)/unzip.Plo \ +@AMDEP_TRUE@ ./$(DEPDIR)/zip.Plo COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ @@ -294,6 +302,10 @@ clean-libLTLIBRARIES: echo "rm -f \"$${dir}/so_locations\""; \ rm -f "$${dir}/so_locations"; \ done +ioapi.lo: minizip/ioapi.c +mztools.lo: minizip/mztools.c +unzip.lo: minizip/unzip.c +zip.lo: minizip/zip.c libhttrack.la: $(libhttrack_la_OBJECTS) $(libhttrack_la_DEPENDENCIES) $(LINK) -rpath $(libdir) $(libhttrack_la_LDFLAGS) $(libhttrack_la_OBJECTS) $(libhttrack_la_LIBADD) $(LIBS) binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) @@ -387,7 +399,11 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htswrap.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/htszlib.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/httrack.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ioapi.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/md5.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mztools.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unzip.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/zip.Plo@am__quote@ distclean-depend: -rm -rf ./$(DEPDIR) @@ -425,6 +441,138 @@ distclean-depend: @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< +ioapi.o: minizip/ioapi.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ioapi.o -MD -MP -MF "$(DEPDIR)/ioapi.Tpo" \ +@am__fastdepCC_TRUE@ -c -o ioapi.o `test -f 'minizip/ioapi.c' || echo '$(srcdir)/'`minizip/ioapi.c; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/ioapi.Tpo" "$(DEPDIR)/ioapi.Po"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/ioapi.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/ioapi.c' object='ioapi.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/ioapi.Po' tmpdepfile='$(DEPDIR)/ioapi.TPo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ioapi.o `test -f 'minizip/ioapi.c' || echo '$(srcdir)/'`minizip/ioapi.c + +ioapi.obj: minizip/ioapi.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ioapi.obj -MD -MP -MF "$(DEPDIR)/ioapi.Tpo" \ +@am__fastdepCC_TRUE@ -c -o ioapi.obj `if test -f 'minizip/ioapi.c'; then $(CYGPATH_W) 'minizip/ioapi.c'; else $(CYGPATH_W) '$(srcdir)/minizip/ioapi.c'`; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/ioapi.Tpo" "$(DEPDIR)/ioapi.Po"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/ioapi.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/ioapi.c' object='ioapi.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/ioapi.Po' tmpdepfile='$(DEPDIR)/ioapi.TPo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ioapi.obj `if test -f 'minizip/ioapi.c'; then $(CYGPATH_W) 'minizip/ioapi.c'; else $(CYGPATH_W) '$(srcdir)/minizip/ioapi.c'` + +ioapi.lo: minizip/ioapi.c +@am__fastdepCC_TRUE@ if $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ioapi.lo -MD -MP -MF "$(DEPDIR)/ioapi.Tpo" \ +@am__fastdepCC_TRUE@ -c -o ioapi.lo `test -f 'minizip/ioapi.c' || echo '$(srcdir)/'`minizip/ioapi.c; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/ioapi.Tpo" "$(DEPDIR)/ioapi.Plo"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/ioapi.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/ioapi.c' object='ioapi.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/ioapi.Plo' tmpdepfile='$(DEPDIR)/ioapi.TPlo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ioapi.lo `test -f 'minizip/ioapi.c' || echo '$(srcdir)/'`minizip/ioapi.c + +mztools.o: minizip/mztools.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT mztools.o -MD -MP -MF "$(DEPDIR)/mztools.Tpo" \ +@am__fastdepCC_TRUE@ -c -o mztools.o `test -f 'minizip/mztools.c' || echo '$(srcdir)/'`minizip/mztools.c; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/mztools.Tpo" "$(DEPDIR)/mztools.Po"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/mztools.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/mztools.c' object='mztools.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/mztools.Po' tmpdepfile='$(DEPDIR)/mztools.TPo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mztools.o `test -f 'minizip/mztools.c' || echo '$(srcdir)/'`minizip/mztools.c + +mztools.obj: minizip/mztools.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT mztools.obj -MD -MP -MF "$(DEPDIR)/mztools.Tpo" \ +@am__fastdepCC_TRUE@ -c -o mztools.obj `if test -f 'minizip/mztools.c'; then $(CYGPATH_W) 'minizip/mztools.c'; else $(CYGPATH_W) '$(srcdir)/minizip/mztools.c'`; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/mztools.Tpo" "$(DEPDIR)/mztools.Po"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/mztools.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/mztools.c' object='mztools.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/mztools.Po' tmpdepfile='$(DEPDIR)/mztools.TPo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mztools.obj `if test -f 'minizip/mztools.c'; then $(CYGPATH_W) 'minizip/mztools.c'; else $(CYGPATH_W) '$(srcdir)/minizip/mztools.c'` + +mztools.lo: minizip/mztools.c +@am__fastdepCC_TRUE@ if $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT mztools.lo -MD -MP -MF "$(DEPDIR)/mztools.Tpo" \ +@am__fastdepCC_TRUE@ -c -o mztools.lo `test -f 'minizip/mztools.c' || echo '$(srcdir)/'`minizip/mztools.c; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/mztools.Tpo" "$(DEPDIR)/mztools.Plo"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/mztools.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/mztools.c' object='mztools.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/mztools.Plo' tmpdepfile='$(DEPDIR)/mztools.TPlo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mztools.lo `test -f 'minizip/mztools.c' || echo '$(srcdir)/'`minizip/mztools.c + +unzip.o: minizip/unzip.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT unzip.o -MD -MP -MF "$(DEPDIR)/unzip.Tpo" \ +@am__fastdepCC_TRUE@ -c -o unzip.o `test -f 'minizip/unzip.c' || echo '$(srcdir)/'`minizip/unzip.c; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/unzip.Tpo" "$(DEPDIR)/unzip.Po"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/unzip.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/unzip.c' object='unzip.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/unzip.Po' tmpdepfile='$(DEPDIR)/unzip.TPo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o unzip.o `test -f 'minizip/unzip.c' || echo '$(srcdir)/'`minizip/unzip.c + +unzip.obj: minizip/unzip.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT unzip.obj -MD -MP -MF "$(DEPDIR)/unzip.Tpo" \ +@am__fastdepCC_TRUE@ -c -o unzip.obj `if test -f 'minizip/unzip.c'; then $(CYGPATH_W) 'minizip/unzip.c'; else $(CYGPATH_W) '$(srcdir)/minizip/unzip.c'`; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/unzip.Tpo" "$(DEPDIR)/unzip.Po"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/unzip.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/unzip.c' object='unzip.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/unzip.Po' tmpdepfile='$(DEPDIR)/unzip.TPo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o unzip.obj `if test -f 'minizip/unzip.c'; then $(CYGPATH_W) 'minizip/unzip.c'; else $(CYGPATH_W) '$(srcdir)/minizip/unzip.c'` + +unzip.lo: minizip/unzip.c +@am__fastdepCC_TRUE@ if $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT unzip.lo -MD -MP -MF "$(DEPDIR)/unzip.Tpo" \ +@am__fastdepCC_TRUE@ -c -o unzip.lo `test -f 'minizip/unzip.c' || echo '$(srcdir)/'`minizip/unzip.c; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/unzip.Tpo" "$(DEPDIR)/unzip.Plo"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/unzip.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/unzip.c' object='unzip.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/unzip.Plo' tmpdepfile='$(DEPDIR)/unzip.TPlo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o unzip.lo `test -f 'minizip/unzip.c' || echo '$(srcdir)/'`minizip/unzip.c + +zip.o: minizip/zip.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT zip.o -MD -MP -MF "$(DEPDIR)/zip.Tpo" \ +@am__fastdepCC_TRUE@ -c -o zip.o `test -f 'minizip/zip.c' || echo '$(srcdir)/'`minizip/zip.c; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/zip.Tpo" "$(DEPDIR)/zip.Po"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/zip.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/zip.c' object='zip.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/zip.Po' tmpdepfile='$(DEPDIR)/zip.TPo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o zip.o `test -f 'minizip/zip.c' || echo '$(srcdir)/'`minizip/zip.c + +zip.obj: minizip/zip.c +@am__fastdepCC_TRUE@ if $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT zip.obj -MD -MP -MF "$(DEPDIR)/zip.Tpo" \ +@am__fastdepCC_TRUE@ -c -o zip.obj `if test -f 'minizip/zip.c'; then $(CYGPATH_W) 'minizip/zip.c'; else $(CYGPATH_W) '$(srcdir)/minizip/zip.c'`; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/zip.Tpo" "$(DEPDIR)/zip.Po"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/zip.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/zip.c' object='zip.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/zip.Po' tmpdepfile='$(DEPDIR)/zip.TPo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o zip.obj `if test -f 'minizip/zip.c'; then $(CYGPATH_W) 'minizip/zip.c'; else $(CYGPATH_W) '$(srcdir)/minizip/zip.c'` + +zip.lo: minizip/zip.c +@am__fastdepCC_TRUE@ if $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT zip.lo -MD -MP -MF "$(DEPDIR)/zip.Tpo" \ +@am__fastdepCC_TRUE@ -c -o zip.lo `test -f 'minizip/zip.c' || echo '$(srcdir)/'`minizip/zip.c; \ +@am__fastdepCC_TRUE@ then mv "$(DEPDIR)/zip.Tpo" "$(DEPDIR)/zip.Plo"; \ +@am__fastdepCC_TRUE@ else rm -f "$(DEPDIR)/zip.Tpo"; exit 1; \ +@am__fastdepCC_TRUE@ fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='minizip/zip.c' object='zip.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ depfile='$(DEPDIR)/zip.Plo' tmpdepfile='$(DEPDIR)/zip.TPlo' @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o zip.lo `test -f 'minizip/zip.c' || echo '$(srcdir)/'`minizip/zip.c + mostlyclean-libtool: -rm -f *.lo @@ -512,6 +660,7 @@ top_distdir = .. distdir = $(top_distdir)/$(PACKAGE)-$(VERSION) distdir: $(DISTFILES) + $(mkinstalldirs) $(distdir)/minizip @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ list='$(DISTFILES)'; for file in $$list; do \ case $$file in \ diff --git a/src/hts-indextmpl.h b/src/hts-indextmpl.h index 29dd122..cddbefa 100644 --- a/src/hts-indextmpl.h +++ b/src/hts-indextmpl.h @@ -158,6 +158,13 @@ regen: " "LF\ " "LF +#define HTS_INDEX_BODYCAT \ + ""LF\ + " "LF\ + "
"LF\ + " %s"LF\ + " "LF + /* %s = INFO */ /* %s = META REFRESH IF ANY */ #define HTS_INDEX_FOOTER \ @@ -167,7 +174,7 @@ regen: "
"LF\ "
"LF\ "
"LF\ - " Mirror and index made by HTTrack Website Copier [XR&CO'2003]"LF\ + " Mirror and index made by HTTrack Website Copier [XR&CO'2005]"LF\ "
"LF\ " %s"LF\ " "LF\ @@ -186,7 +193,7 @@ regen: ""LF\ ""LF\ " "LF\ - " "LF\ + " "LF\ " "LF\ "
© 2003 Xavier Roche & other contributors - Web Design: Kauler Leto.© 2005 Xavier Roche & other contributors - Web Design: Kauler Leto.
"LF\ ""LF\ @@ -317,7 +324,7 @@ regen: " "LF\ "
"LF\ "
"LF\ - " Mirror and index made by HTTrack Website Copier [XR&CO'2003]"LF\ + " Mirror and index made by HTTrack Website Copier [XR&CO'2005]"LF\ "
"LF\ " %s"LF\ " "LF\ @@ -335,7 +342,7 @@ regen: ""LF\ ""LF\ " "LF\ - " "LF\ + " "LF\ " "LF\ "
© 2003 Xavier Roche & other contributors - Web Design: Kauler Leto.© 2005 Xavier Roche & other contributors - Web Design: Kauler Leto.
"LF\ ""LF\ @@ -476,7 +483,7 @@ regen: ""LF\ ""LF\ " "LF\ - " "LF\ + " "LF\ " "LF\ "
© 2003 Xavier Roche & other contributors - Web Design: Kauler Leto.© 2005 Xavier Roche & other contributors - Web Design: Kauler Leto.
"LF\ ""LF\ @@ -613,7 +620,7 @@ regen: ""LF\ ""LF\ " "LF\ - " "LF\ + " "LF\ " "LF\ "
© 2003 Xavier Roche & other contributors - Web Design: Kauler Leto.© 2005 Xavier Roche & other contributors - Web Design: Kauler Leto.
"LF\ ""LF\ diff --git a/src/htsalias.c b/src/htsalias.c index 1b65945..d2e09e1 100644 --- a/src/htsalias.c +++ b/src/htsalias.c @@ -35,12 +35,13 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ -#include -#include -#include +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + #include "htsbase.h" #include "htsalias.h" #include "htsglobal.h" + void linput(FILE* fp,char* s,int max); void hts_lowcase(char* s); @@ -108,6 +109,7 @@ const char* hts_optalias[][4] = { {"host-control","-H","param",""}, {"extended-parsing","-%P","param",""}, {"near","-n","single",""}, + {"disable-security-limits","-%!","single",""}, {"test","-t","single",""}, {"list","-%L","param1",""}, {"urllist","-%S","param1",""}, @@ -115,7 +117,7 @@ const char* hts_optalias[][4] = { {"structure","-N","param",""}, {"user-structure","-N","param1",""}, {"long-names","-L","param",""}, {"keep-links","-K","param",""}, - {"mime-html","-%M","param",""}, {"mht","-%M","param",""}, + {"mime-html","-%M","single",""}, {"mht","-%M","single",""}, {"replace-external","-x","single",""}, {"disable-passwords","-%x","single",""},{"disable-password","-%x","single",""}, {"include-query-string","-%q","single",""}, @@ -135,6 +137,8 @@ const char* hts_optalias[][4] = { {"updatehack","-%s","single",""}, {"sizehack","-%s","single",""}, {"urlhack","-%u","single",""}, {"user-agent","-F","param1","user-agent identity"}, + {"referer","-%R","param1","default referer URL"}, + {"from","-%E","param1","from email address"}, {"footer","-%F","param1",""}, {"cache","-C","param","number of retries for non-fatal errors"}, {"store-all-in-cache","-k","single",""}, @@ -150,7 +154,7 @@ const char* hts_optalias[][4] = { {"priority","-p","param",""}, {"debug-headers","-%H","single",""}, {"userdef-cmd","-V","param1",""}, - {"callback","-%W","param1",""}, {"wrapper","-%W","param1",""}, + {"callback","-%W","param1","plug an external callback"}, {"wrapper","-%W","param1","plug an external callback"}, {"structure","-N","param1","user-defined structure"}, {"usercommand","-V","param1","user-defined command"}, {"display","-%v","single","show files transfered and other funny realtime information"}, @@ -185,7 +189,10 @@ const char* hts_optalias[][4] = { {"fast-engine","-#X","single","Enable fast routines"}, {"debug-overflows","-#X0","single","Attempt to detect buffer overflows"}, {"debug-cache","-#C","param1","List files in the cache"}, - + {"extract-cache","-#C","single","Extract meta-data"}, + {"debug-parsing","-#d","single","debug: test parser"}, + {"repair-cache","-#R","single","repair the damaged cache ZIP file"}, {"repair","-#R","single",""}, + /* STANDARD ALIASES */ {"spider","-p0C0I0t","single",""}, {"testsite","-p0C0I0t","single",""}, @@ -226,6 +233,7 @@ const char* hts_optalias[][4] = { {"updatehttrack","--updatehttrack","single","update HTTrack Website Copier"}, {"clean","--clean","single","clean up log files and cache"}, {"tide","--clean","single","clean up log files and cache"}, + {"autotest","-#T","single",""}, /* */ {"","","",""} @@ -342,7 +350,7 @@ int optalias_check(int argc,const char * const * argv,int n_arg, return need_param; } - /* Check -P */ + /* Check -O */ { int pos; if ((pos=optreal_find(argv[n_arg]))>=0) { @@ -514,17 +522,19 @@ int optinclude_file(const char* name, /* Get home directory, '.' if failed */ /* example: /home/smith */ char* hts_gethome(void) { +#ifndef _WIN32_WCE char* home = getenv( "HOME" ); if (home) return home; else +#endif return "."; } /* Convert ~/foo into /home/smith/foo */ void expand_home(char* str) { if (str[0] == '~') { - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; strcpybuff(tempo,hts_gethome()); strcatbuff(tempo,str+1); strcpybuff(str,tempo); diff --git a/src/htsalias.h b/src/htsalias.h index e5e8f82..21c3142 100644 --- a/src/htsalias.h +++ b/src/htsalias.h @@ -39,6 +39,8 @@ Please visit our Website: http://www.httrack.com #ifndef HTSALIAS_DEFH #define HTSALIAS_DEFH +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE extern const char* hts_optalias[][4]; int optalias_check(int argc,const char * const * argv,int n_arg, int* return_argc,char** return_argv, @@ -54,5 +56,6 @@ const char* opttype_value(int p); const char* opthelp_value(int p); char* hts_gethome(void); void expand_home(char* str); +#endif #endif diff --git a/src/htsback.c b/src/htsback.c index 6d0b119..317d4e7 100644 --- a/src/htsback.c +++ b/src/htsback.c @@ -35,15 +35,15 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + #include "htsback.h" /* specific definitions */ #include "htsbase.h" #include "htsnet.h" #include "htsthread.h" -#include -#include -#include #include /* END specific definitions */ @@ -51,14 +51,18 @@ Please visit our Website: http://www.httrack.com #include "htsftp.h" #if HTS_USEZLIB #include "htszlib.h" +#else +#error HTS_USEZLIB not defined #endif //#endif #if HTS_WIN #ifndef __cplusplus // DOS +#ifndef _WIN32_WCE #include /* _beginthread, _endthread */ #endif +#endif #else #endif @@ -142,228 +146,243 @@ int back_nsoc_overall(lien_back* back,int back_max) { // fermer les paramètres de transfert, // et notamment vérifier les fichiers compressés (décompresser), callback etc. int back_finalize(httrackp* opt,cache_back* cache,lien_back* back,int p) { - if ( + /* Don't store broken files */ + if (back[p].r.totalsize > 0 && back[p].r.size != back[p].r.totalsize && ! opt->tolerant) { + return -1; + } + + /* Store ? */ + if (!back[p].finalized) { + back[p].finalized = 1; + if ( (back[p].status == 0) // ready && - (!back[p].testmode) // not test mode - && (back[p].r.statuscode>0) // not internal error ) { - char* state="unknown"; - - /* décompression */ + if (!back[p].testmode) { // not test mode + char* state="unknown"; + + /* décompression */ #if HTS_USEZLIB - if (gz_is_available && back[p].r.compressed) { - if (back[p].r.size > 0) { - //if ( (back[p].r.adr) && (back[p].r.size>0) ) { - // stats - back[p].compressed_size=back[p].r.size; - // en mémoire -> passage sur disque - if (!back[p].r.is_write) { - back[p].tmpfile_buffer[0]='\0'; - back[p].tmpfile=tmpnam(back[p].tmpfile_buffer); - if (back[p].tmpfile != NULL && back[p].tmpfile[0] != '\0') { - back[p].r.out=fopen(back[p].tmpfile,"wb"); - if (back[p].r.out) { - if ((back[p].r.adr) && (back[p].r.size>0)) { - if (fwrite(back[p].r.adr,1,(INTsys)back[p].r.size,back[p].r.out) != back[p].r.size) { + if (gz_is_available && back[p].r.compressed) { + if (back[p].r.size > 0) { + //if ( (back[p].r.adr) && (back[p].r.size>0) ) { + // stats + back[p].compressed_size=back[p].r.size; + // en mémoire -> passage sur disque + if (!back[p].r.is_write) { + back[p].tmpfile_buffer[0]='\0'; + back[p].tmpfile=tmpnam(back[p].tmpfile_buffer); + if (back[p].tmpfile != NULL && back[p].tmpfile[0] != '\0') { + back[p].r.out=fopen(back[p].tmpfile,"wb"); + if (back[p].r.out) { + if ((back[p].r.adr) && (back[p].r.size>0)) { + if (fwrite(back[p].r.adr,1,(INTsys)back[p].r.size,back[p].r.out) != back[p].r.size) { + back[p].r.statuscode=-1; + strcpybuff(back[p].r.msg,"Write error when decompressing"); + } + } else { + back[p].tmpfile[0]='\0'; back[p].r.statuscode=-1; - strcpybuff(back[p].r.msg,"Write error when decompressing"); + strcpybuff(back[p].r.msg,"Empty compressed file"); } } else { back[p].tmpfile[0]='\0'; back[p].r.statuscode=-1; - strcpybuff(back[p].r.msg,"Empty compressed file"); + strcpybuff(back[p].r.msg,"Open error when decompressing"); } - } else { - back[p].tmpfile[0]='\0'; - back[p].r.statuscode=-1; - strcpybuff(back[p].r.msg,"Open error when decompressing"); } } - } - // fermer fichier sortie - if (back[p].r.out!=NULL) { - fclose(back[p].r.out); - back[p].r.out=NULL; - } - // décompression - if (back[p].tmpfile != NULL && back[p].tmpfile[0] != '\0' && back[p].url_sav[0]) { - LLint size; - filecreateempty(back[p].url_sav); // filenote & co - if ((size = hts_zunpack(back[p].tmpfile,back[p].url_sav))>=0) { - back[p].r.size=back[p].r.totalsize=size; - // fichier -> mémoire - if (!back[p].r.is_write) { - deleteaddr(&back[p].r); - back[p].r.adr=readfile(back[p].url_sav); - if (!back[p].r.adr) { - back[p].r.statuscode=-1; - strcpybuff(back[p].r.msg,"Read error when decompressing"); + // fermer fichier sortie + if (back[p].r.out!=NULL) { + fclose(back[p].r.out); + back[p].r.out=NULL; + } + // décompression + if (back[p].tmpfile != NULL && back[p].tmpfile[0] != '\0' && back[p].url_sav[0]) { + LLint size; + filecreateempty(back[p].url_sav); // filenote & co + if ((size = hts_zunpack(back[p].tmpfile,back[p].url_sav))>=0) { + back[p].r.size=back[p].r.totalsize=size; + // fichier -> mémoire + if (!back[p].r.is_write) { + deleteaddr(&back[p].r); + back[p].r.adr=readfile(back[p].url_sav); + if (!back[p].r.adr) { + back[p].r.statuscode=-1; + strcpybuff(back[p].r.msg,"Read error when decompressing"); + } + remove(back[p].url_sav); } - remove(back[p].url_sav); } + remove(back[p].tmpfile); } - remove(back[p].tmpfile); + // stats + HTS_STAT.total_packed+=back[p].compressed_size; + HTS_STAT.total_unpacked+=back[p].r.size; + HTS_STAT.total_packedfiles++; + // unflag } - // stats - HTS_STAT.total_packed+=back[p].compressed_size; - HTS_STAT.total_unpacked+=back[p].r.size; - HTS_STAT.total_packedfiles++; - // unflag } - } - back[p].r.compressed=0; + back[p].r.compressed=0; #endif - - /* Stats */ - if (cache->txt) { - char flags[32]; - char s[256]; - time_t tt; - struct tm* A; - tt=time(NULL); - A=localtime(&tt); - if (A == NULL) { - int localtime_returned_null=0; - assert(localtime_returned_null); - } - strftime(s,250,"%H:%M:%S",A); - flags[0]='\0'; - /* input flags */ - if (back[p].is_update) - strcatbuff(flags, "U"); // update request - else - strcatbuff(flags, "-"); - if (back[p].range_req_size) - strcatbuff(flags, "R"); // range request - else - strcatbuff(flags, "-"); - /* state flags */ - if (back[p].r.is_file) // direct to disk - strcatbuff(flags, "F"); - else - strcatbuff(flags, "-"); - /* output flags */ - if (!back[p].r.notmodified) - strcatbuff(flags, "M"); // modified - else - strcatbuff(flags, "-"); - if (back[p].r.is_chunk) // chunked - strcatbuff(flags, "C"); - else - strcatbuff(flags, "-"); - if (back[p].r.compressed) - strcatbuff(flags, "Z"); // gzip - else - strcatbuff(flags, "-"); - /* Err I had to split these.. */ - fprintf(cache->txt,"%s\t", s); - fprintf(cache->txt,LLintP"/", (LLint)back[p].r.size); - fprintf(cache->txt,LLintP,(LLint)back[p].r.totalsize); - fprintf(cache->txt,"\t%s\t",flags); - } - if (back[p].r.statuscode==200) { - if (back[p].r.size>=0) { - if (strcmp(back[p].url_fil,"/robots.txt") !=0 ) { - HTS_STAT.stat_bytes+=back[p].r.size; - HTS_STAT.stat_files++; + /* Stats */ + if (cache->txt) { + char flags[32]; + char s[256]; + time_t tt; + struct tm* A; + tt=time(NULL); + A=localtime(&tt); + if (A == NULL) { + int localtime_returned_null=0; + assert(localtime_returned_null); } - if ( (!back[p].r.notmodified) && (opt->is_update) ) { - HTS_STAT.stat_updated_files++; // page modifiée - if (opt->log!=NULL) { - fspc(opt->log,"info"); - if (back[p].is_update) { - fprintf(opt->log,"engine: transfer-status: link updated: %s%s -> %s"LF,back[p].url_adr,back[p].url_fil,back[p].url_sav); - } else { - fprintf(opt->log,"engine: transfer-status: link added: %s%s -> %s"LF,back[p].url_adr,back[p].url_fil,back[p].url_sav); - } - test_flush; + strftime(s,250,"%H:%M:%S",A); + + flags[0]='\0'; + /* input flags */ + if (back[p].is_update) + strcatbuff(flags, "U"); // update request + else + strcatbuff(flags, "-"); + if (back[p].range_req_size) + strcatbuff(flags, "R"); // range request + else + strcatbuff(flags, "-"); + /* state flags */ + if (back[p].r.is_file) // direct to disk + strcatbuff(flags, "F"); + else + strcatbuff(flags, "-"); + /* output flags */ + if (!back[p].r.notmodified) + strcatbuff(flags, "M"); // modified + else + strcatbuff(flags, "-"); + if (back[p].r.is_chunk) // chunked + strcatbuff(flags, "C"); + else + strcatbuff(flags, "-"); + if (back[p].r.compressed) + strcatbuff(flags, "Z"); // gzip + else + strcatbuff(flags, "-"); + /* Err I had to split these.. */ + fprintf(cache->txt,"%s\t", s); + fprintf(cache->txt,LLintP"/", (LLint)back[p].r.size); + fprintf(cache->txt,LLintP,(LLint)back[p].r.totalsize); + fprintf(cache->txt,"\t%s\t",flags); + } + if (back[p].r.statuscode==200) { + if (back[p].r.size>=0) { + if (strcmp(back[p].url_fil,"/robots.txt") !=0 ) { + HTS_STAT.stat_bytes+=back[p].r.size; + HTS_STAT.stat_files++; } - if (cache->txt) { - if (back[p].is_update) { - state="updated"; - } else { - state="added"; + if ( (!back[p].r.notmodified) && (opt->is_update) ) { + HTS_STAT.stat_updated_files++; // page modifiée + if (opt->log!=NULL) { + fspc(opt->log,"info"); + if (back[p].is_update) { + fprintf(opt->log,"engine: transfer-status: link updated: %s%s -> %s"LF,back[p].url_adr,back[p].url_fil,back[p].url_sav); + } else { + fprintf(opt->log,"engine: transfer-status: link added: %s%s -> %s"LF,back[p].url_adr,back[p].url_fil,back[p].url_sav); + } + test_flush; + } + if (cache->txt) { + if (back[p].is_update) { + state="updated"; + } else { + state="added"; + } + } + } else { + if ( (opt->debug>0) && (opt->log!=NULL) ) { + fspc(opt->log,"info"); fprintf(opt->log,"engine: transfer-status: link recorded: %s%s -> %s"LF,back[p].url_adr,back[p].url_fil,back[p].url_sav); + test_flush; + } + if (cache->txt) { + if (opt->is_update) + state="untouched"; + else + state="added"; } } } else { if ( (opt->debug>0) && (opt->log!=NULL) ) { - fspc(opt->log,"info"); fprintf(opt->log,"engine: transfer-status: link recorded: %s%s -> %s"LF,back[p].url_adr,back[p].url_fil,back[p].url_sav); + fspc(opt->log,"info"); fprintf(opt->log,"engine: transfer-status: empty file? (%d, '%s'): %s%s"LF,back[p].r.statuscode,back[p].r.msg,back[p].url_adr,back[p].url_fil); test_flush; } if (cache->txt) { - if (opt->is_update) - state="untouched"; - else - state="added"; + state="empty"; } } } else { if ( (opt->debug>0) && (opt->log!=NULL) ) { - fspc(opt->log,"info"); fprintf(opt->log,"engine: transfer-status: empty file? (%d, '%s'): %s%s"LF,back[p].r.statuscode,back[p].r.msg,back[p].url_adr,back[p].url_fil); - test_flush; + fspc(opt->log,"info"); fprintf(opt->log,"engine: transfer-status: link error (%d, '%s'): %s%s"LF,back[p].r.statuscode,back[p].r.msg,back[p].url_adr,back[p].url_fil); } if (cache->txt) { - state="empty"; + state="error"; } } - } else { - if ( (opt->debug>0) && (opt->log!=NULL) ) { - fspc(opt->log,"info"); fprintf(opt->log,"engine: transfer-status: link error (%d, '%s'): %s%s"LF,back[p].r.statuscode,back[p].r.msg,back[p].url_adr,back[p].url_fil); - } if (cache->txt) { - state="error"; + fprintf(cache->txt, + "%d\t" + "%s ('%s')\t" + "%s\t" + "%s%s\t" + "%s%s\t%s\t" + "(from %s%s)" + LF, + back[p].r.statuscode, + state, escape_check_url_addr(back[p].r.msg), + escape_check_url_addr(back[p].r.contenttype), + ((back[p].r.etag[0])?"etag:":((back[p].r.lastmodified[0])?"date:":"")), escape_check_url_addr((back[p].r.etag[0])?back[p].r.etag:(back[p].r.lastmodified)), + escape_check_url_addr(back[p].url_adr),escape_check_url_addr(back[p].url_fil),escape_check_url_addr(back[p].url_sav), + escape_check_url_addr(back[p].referer_adr),escape_check_url_addr(back[p].referer_fil) + ); + if (opt->flush) + fflush(cache->txt); } - } - if (cache->txt) { - fprintf(cache->txt, - "%d\t" - "%s ('%s')\t" - "%s\t" - "%s%s\t" - "%s%s\t%s\t" - "(from %s%s)" - LF, - back[p].r.statuscode, - state, escape_check_url_addr(back[p].r.msg), - escape_check_url_addr(back[p].r.contenttype), - ((back[p].r.etag[0])?"etag:":((back[p].r.lastmodified[0])?"date:":"")), escape_check_url_addr((back[p].r.etag[0])?back[p].r.etag:(back[p].r.lastmodified)), - escape_check_url_addr(back[p].url_adr),escape_check_url_addr(back[p].url_fil),escape_check_url_addr(back[p].url_sav), - escape_check_url_addr(back[p].referer_adr),escape_check_url_addr(back[p].referer_fil) - ); - if (opt->flush) - fflush(cache->txt); - } - - /* Cache */ - cache_mayadd(opt,cache,&back[p].r,back[p].url_adr,back[p].url_fil,back[p].url_sav); - - // status finished callback + + /* Cache */ + cache_mayadd(opt,cache,&back[p].r,back[p].url_adr,back[p].url_fil,back[p].url_sav); + + // status finished callback #if HTS_ANALYSTE - hts_htmlcheck_xfrstatus(&back[p]); + hts_htmlcheck_xfrstatus(&back[p]); #endif - return 0; + return 0; + } else { // testmode + if (back[p].r.statuscode / 100 >= 3) { /* Store 3XX, 4XX, 5XX test response codes, but NOT 2XX */ + /* Cache */ + cache_mayadd(opt,cache,&back[p].r,back[p].url_adr,back[p].url_fil,NULL); + } + } + } } return -1; } /* try to keep the connection alive */ -int back_letlive(httrackp* opt, lien_back* back, int p) { +int back_letlive(httrackp* opt, cache_back* cache, lien_back* back, int p) { + int checkerror; htsblk* src = &back[p].r; if (src && !src->is_file && src->soc != INVALID_SOCKET && src->statuscode >= 0 /* no timeout errors & co */ && src->keep_alive_trailers == 0 /* not yet supported (chunk trailers) */ - && !check_sockerror(src->soc) + && ! ( checkerror = check_sockerror(src->soc) ) /*&& !check_sockdata(src->soc)*/ /* no unexpected data */ ) { htsblk tmp; memset(&tmp, 0, sizeof(tmp)); /* clear everything but connection: switch, close, and reswitch */ back_connxfr(src, &tmp); - back_delete(opt, back, p); + back_delete(opt, cache, back, p); //deletehttp(src); back_connxfr(&tmp, src); src->req.flush_garbage=1; /* ignore CRLF garbage */ @@ -392,17 +411,25 @@ void back_connxfr(htsblk* src, htsblk* dst) { } // clear, or leave for keep-alive -int back_maydelete(httrackp* opt,lien_back* back, int p) { +int back_maydelete(httrackp* opt,cache_back* cache,lien_back* back, int p) { if (p>=0) { // on sait jamais.. - if (!opt->nokeepalive + if ( + /* Keep-alive authorized by user */ + !opt->nokeepalive + /* Socket currently is keep-alive! */ && back[p].r.keep_alive + /* Remaining authorized requests */ && back[p].r.keep_alive_max > 1 + /* Known keep-alive start (security) */ && back[p].ka_time_start + /* We're on time */ && time_local() < back[p].ka_time_start + back[p].r.keep_alive_t + /* Connection delay must not exceed keep-alive timeout */ + && ( opt->maxconn <= 0 || ( back[p].r.keep_alive_t > ( 1.0 / opt->maxconn ) ) ) ) { lien_back tmp; strcpybuff(tmp.url_adr, back[p].url_adr); - if (back_letlive(opt, back, p)) { + if (back_letlive(opt, cache, back, p)) { strcpybuff(back[p].url_adr, tmp.url_adr); back[p].status = -103; // alive & waiting if ((opt->debug>1) && (opt->log!=NULL)) { @@ -413,21 +440,37 @@ int back_maydelete(httrackp* opt,lien_back* back, int p) { return 1; } } - back_delete(opt,back, p); + back_delete(opt,cache,back, p); } return 0; } // clear, or leave for keep-alive -void back_maydeletehttp(httrackp* opt, lien_back* back, int back_max, int p) { +void back_maydeletehttp(httrackp* opt, cache_back* cache, lien_back* back, int back_max, int p) { + TStamp lt = 0; if (back[p].r.soc!=INVALID_SOCKET) { int q; - if (!opt->nokeepalive + if ( + back[p].r.soc != INVALID_SOCKET /* security check */ + && back[p].r.statuscode >= 0 /* no timeout errors & co */ + && back[p].r.keep_alive_trailers == 0 /* not yet supported (chunk trailers) */ + /* Socket not in I/O error status */ + && !back[p].r.is_file + && !check_sockerror(back[p].r.soc) + /* Keep-alive authorized by user */ + && !opt->nokeepalive + /* Socket currently is keep-alive! */ && back[p].r.keep_alive + /* Remaining authorized requests */ && back[p].r.keep_alive_max > 1 + /* Known keep-alive start (security) */ && back[p].ka_time_start - && time_local() < back[p].ka_time_start + back[p].r.keep_alive_t - && ( q = back_search(opt, back, back_max) ) >= 0 + /* We're on time */ + && ( lt = time_local() ) < back[p].ka_time_start + back[p].r.keep_alive_t + /* Connection delay must not exceed keep-alive timeout */ + && ( opt->maxconn <= 0 || ( back[p].r.keep_alive_t > ( 1.0 / opt->maxconn ) ) ) + /* Available slot in backing */ + && ( q = back_search(opt, cache, back, back_max) ) >= 0 ) { lien_back tmp; @@ -452,13 +495,13 @@ void back_maydeletehttp(httrackp* opt, lien_back* back, int back_max, int p) { /* attempt to attach a live connection to this slot */ -int back_trylive(httrackp* opt,lien_back* back, int back_max, int p) { +int back_trylive(httrackp* opt,cache_back* cache,lien_back* back, int back_max, int p) { if (p>=0 && back[p].status != -103) { // we never know.. int i = back_searchlive(opt,back, back_max, back[p].url_adr); // search slot if (i >= 0 && i != p) { deletehttp(&back[p].r); // security check back_connxfr(&back[i].r, &back[p].r); // transfer live connection settings from i to p - back_delete(opt,back, i); // delete old slot + back_delete(opt,cache,back, i); // delete old slot back[p].status=100; // ready to connect return 1; // success: will reuse live connection } @@ -483,7 +526,7 @@ int back_searchlive(httrackp* opt, lien_back* back, int back_max, char* search_a return -1; } -int back_search(httrackp* opt,lien_back* back, int back_max) { +int back_search(httrackp* opt,cache_back* cache,lien_back* back, int back_max) { int i; /* try to find an empty place */ @@ -497,7 +540,7 @@ int back_search(httrackp* opt,lien_back* back, int back_max) { for(i = 0 ; i < back_max ; i++ ) { if (back[i].status == -103) { /* close this place */ - back_delete(opt,back, i); + back_delete(opt,cache,back, i); return i; } } @@ -507,18 +550,33 @@ int back_search(httrackp* opt,lien_back* back, int back_max) { } // effacer entrée -int back_delete(httrackp* opt, lien_back* back, int p) { +int back_delete(httrackp* opt, cache_back* cache, lien_back* back, int p) { if (p>=0) { // on sait jamais.. // Vérificateur d'intégrité #if DEBUG_CHECKINT _CHECKINT(&back[p],"Appel back_delete") #endif #if HTS_DEBUG_CLOSESOCK - char info[256]; - sprintf(info,"back_delete: #%d\n",p); - DEBUG_W2(info); + DEBUG_W("back_delete: #%d\n" _ (int) p); #endif - + + // Finalize + if (!back[p].finalized) { + if ( + (back[p].status == 0) // ready + && + (!back[p].testmode) // not test mode + && + (back[p].r.statuscode>0) // not internal error + ) { + if ((opt->debug>1) && (opt->log!=NULL)) { + fspc(opt->log,"debug"); fprintf(opt->log,"File '%s%s' -> %s not yet saved in cache - saving now"LF, back[p].url_adr, back[p].url_fil, back[p].url_sav); test_flush; + } + } + back_finalize(opt, cache, back, p); + } + back[p].finalized = 0; + // Libérer tous les sockets, handles, buffers.. if (back[p].r.soc!=INVALID_SOCKET) { #if HTS_DEBUG_CLOSESOCK @@ -546,6 +604,12 @@ int back_delete(httrackp* opt, lien_back* back, int p) { } // } + // headers + if (back[p].r.headers != NULL) { + freet(back[p].r.headers); + back[p].r.headers = NULL; + } + /* fichier de sortie */ if (back[p].r.out!=NULL) { // fermer fichier sortie fclose(back[p].r.out); @@ -607,7 +671,7 @@ int back_add(lien_back* back,int back_max,httrackp* opt,cache_back* cache,char* // rechercher emplacement back_clean(opt, cache, back, back_max); - if ( ( p = back_search(opt, back, back_max) ) >= 0) { + if ( ( p = back_search(opt, cache, back, back_max) ) >= 0) { back[p].send_too[0]='\0'; // éventuels paramètres supplémentaires à transmettre au serveur // clear r @@ -664,6 +728,25 @@ int back_add(lien_back* back,int back_max,httrackp* opt,cache_back* cache,char* return 0; } + // test "fast header" cache ; that is, tests we did that lead to 3XX/4XX/5XX response codes + if (cache->cached_tests != NULL) { + long int ptr = 0; + if (inthash_read((inthash)cache->cached_tests, concat(adr, fil), (long int*)&ptr)) { // gotcha + if (ptr != 0) { + char* text = (char*) ptr; + char* lf = strchr(text, '\n'); + int code = 0; + if (sscanf(text, "%d", &code) == 1) { // got code + back[p].r.statuscode=code; + if (lf != NULL && *lf != '\0') { // got location ? + strcpybuff(back[p].r.location, lf + 1); + } + return 0; + } + } + } + } + // tester cache if ((strcmp(adr,"file://")) /* pas fichier */ && ( (!test) || (cache->type==1) ) /* cache prioritaire, laisser passer en test! */ @@ -681,7 +764,7 @@ int back_add(lien_back* back,int back_max,httrackp* opt,cache_back* cache,char* #else if (cache->use) { #endif - char buff[HTS_URLMAXSIZE*4]; + char BIGSTK buff[HTS_URLMAXSIZE*4]; #if HTS_FAST_CACHE strcpybuff(buff,adr); strcatbuff(buff,fil); hash_pos_return=inthash_read((inthash)cache->hashtable,buff,(long int*)&hash_pos); @@ -710,7 +793,7 @@ int back_add(lien_back* back,int back_max,httrackp* opt,cache_back* cache,char* /* It is possible that the file has been moved due to changes in build structure */ { - char previous_save[HTS_URLMAXSIZE*2]; + char BIGSTK previous_save[HTS_URLMAXSIZE*2]; previous_save[0] = '\0'; back[p].r = cache_readex(opt, cache, adr, fil, NULL, back[p].location_buffer, previous_save, 0); if (previous_save[0] != '\0' && fexist(fconv(previous_save))) { @@ -861,9 +944,6 @@ int back_add(lien_back* back,int back_max,httrackp* opt,cache_back* cache,char* back[p].r.req.nocompression=1; /* Do not compress when updating! */ } - /* else if (strnotempty(cache->lastmodified)) - sprintf(back[p].send_too,"If-Modified-Since: %s\r\n",cache->lastmodified); - */ } #if DEBUGCA printf("..is modified test %s\n",back[p].send_too); @@ -881,10 +961,11 @@ int back_add(lien_back* back,int back_max,httrackp* opt,cache_back* cache,char* // On demande juste les données restantes si le date est valide (206), tout sinon (200) if ((ishtml(save) != 1) && (ishtml(back[p].url_fil)!=1)) { // NON HTML (liens changés!!) if (sz>0) { // Fichier non vide? (question bête, sinon on transfert tout!) - if (strnotempty(cache->lastmodified)) { /* pas de If-.. possible */ - /*if ( (!opt->http10) && (strnotempty(cache->lastmodified)) ) { */ /* ne pas forcer 1.0 */ + char lastmodified[256]; + get_filetime_rfc822(save, lastmodified); + if (strnotempty(lastmodified)) { /* pas de If-.. possible */ #if DEBUGCA - printf("..if unmodified since %s size "LLintP"\n",cache->lastmodified,(LLint)sz); + printf("..if unmodified since %s size "LLintP"\n", lastmodified, (LLint)sz); #endif if ((opt->debug>1) && (opt->log!=NULL)) { fspc(opt->log,"debug"); fprintf(opt->log,"File partially present ("LLintP" bytes): %s%s"LF,(LLint)sz,back[p].url_adr,back[p].url_fil); test_flush; @@ -899,10 +980,10 @@ int back_add(lien_back* back,int back_max,httrackp* opt,cache_back* cache,char* back[p].http11=1; // En tête 1.1 } else */ - if (strlen(cache->lastmodified)) { + if (strlen(lastmodified)) { sprintf(back[p].send_too, "If-Unmodified-Since: %s\r\nRange: bytes="LLintP"-\r\n" - ,cache->lastmodified,(LLint)sz); + , lastmodified, (LLint)sz); back[p].http11=1; // En tête 1.1 back[p].range_req_size=sz; back[p].r.req.range_used=1; @@ -959,6 +1040,8 @@ int back_add(lien_back* back,int back_max,httrackp* opt,cache_back* cache,char* memcpy(&(back[p].r.req.proxy), &opt->proxy, sizeof(opt->proxy)); // et user-agent strcpybuff(back[p].r.req.user_agent,opt->user_agent); + strcpybuff(back[p].r.req.referer,opt->referer); + strcpybuff(back[p].r.req.from,opt->from); strcpybuff(back[p].r.req.lang_iso,opt->lang_iso); back[p].r.req.user_agent_send=opt->user_agent_send; // et http11 @@ -997,7 +1080,7 @@ int back_add(lien_back* back,int back_max,httrackp* opt,cache_back* cache,char* } #endif - if (!back_trylive(opt,back, back_max, p)) { + if (!back_trylive(opt, cache, back, back_max, p)) { #if HTS_XGETHOST #if HDEBUG printf("back_solve..\n"); @@ -1117,7 +1200,7 @@ printf("Xfopen ok, poll..\n"); #if HTS_XGETHOST #if USE_BEGINTHREAD // lancement multithread du robot -PTHREAD_TYPE Hostlookup(void* iadr_p) { +PTHREAD_TYPE PTHREAD_TYPE_FNC Hostlookup(void* iadr_p) { char iadr[256]; t_dnscache* cache=_hts_cache(); // adresse du cache t_hostent* hp; @@ -1209,7 +1292,7 @@ void back_solve(lien_back* back) { char* p = calloct(strlen(a)+2,1); if (p) { strcpybuff(p,a); - _beginthread( Hostlookup , 0, p ); + (void)hts_newthread( Hostlookup , 0, p ); } } #else @@ -1221,7 +1304,7 @@ void back_solve(lien_back* back) { char* p = calloct(strlen(a)+2,1); if (p) { strcpybuff(p,a); - _beginthread( Hostlookup , 0, p ); + (void)hts_newthread( Hostlookup , 0, p ); } #else // Sous Unix, le gethostbyname() est bloquant.. @@ -1264,8 +1347,8 @@ void back_clean(httrackp* opt,cache_back* cache,lien_back* back,int back_max) { if (back[i].r.statuscode==200) { // HTTP "OK" if (back[i].r.size>0) { // size>0 if (back[i].r.is_write // not in memory (on disk, ready) - && !is_hypertext_mime(back[i].r.contenttype) // not HTML/hypertext - && !may_be_hypertext_mime(back[i].r.contenttype) // may NOT be parseable mime type + && !is_hypertext_mime(back[i].r.contenttype, back[i].url_fil) // not HTML/hypertext + && !may_be_hypertext_mime(back[i].r.contenttype, back[i].url_fil) // may NOT be parseable mime type ) { if (back[i].pass2_ptr) { // finalize @@ -1279,20 +1362,20 @@ void back_clean(httrackp* opt,cache_back* cache,lien_back* back,int back_max) { //xxxcache_mayadd(opt,cache,&back[i].r,back[i].url_adr,back[i].url_fil,back[i].url_sav); usercommand(opt, 0, NULL, back[i].url_sav, back[i].url_adr, back[i].url_fil); *back[i].pass2_ptr=-1; // Done! - back_maydelete(opt,back,i); // May delete backing entry if ((opt->debug>0) && (opt->log!=NULL)) { fspc(opt->log,"info"); fprintf(opt->log,"File successfully written in background: %s"LF,back[i].url_sav); test_flush; } + back_maydelete(opt,cache,back,i); // May delete backing entry } } else { if (!back[i].finalized) { if (1) { /* Ensure deleted or recycled socket */ /* BUT DO NOT YET WIPE back[i].r.adr */ - back_maydeletehttp(opt, back, back_max, i); if ( (opt->debug>1) && (opt->log!=NULL) ) { fspc(opt->log,"debug"); fprintf(opt->log,"file %s%s validated (cached, left in memory)"LF,back[i].url_adr,back[i].url_fil); test_flush; } + back_maydeletehttp(opt, cache, back, back_max, i); } else { /* NOT YET HANDLED CORRECTLY (READ IN NEW CACHE TO DO) @@ -1302,7 +1385,7 @@ void back_clean(httrackp* opt,cache_back* cache,lien_back* back,int back_max) { htsblk r; /* Ensure deleted or recycled socket */ - back_maydeletehttp(opt, back, back_max, i); + back_maydeletehttp(opt, cache, back, back_max, i); assertf(back[i].r.soc == INVALID_SOCKET); /* Check header */ @@ -1312,7 +1395,6 @@ void back_clean(httrackp* opt,cache_back* cache,lien_back* back,int back_max) { /* Delete buffer and sockets */ deleteaddr(&back[i].r); deletehttp(&back[i].r); - back[i].finalized = 1; if ( (opt->debug>1) && (opt->log!=NULL) ) { fspc(opt->log,"debug"); fprintf(opt->log,"file %s%s temporarily left in cache to spare memory"LF,back[i].url_adr,back[i].url_fil); test_flush; } @@ -1344,7 +1426,7 @@ void back_clean(httrackp* opt,cache_back* cache,lien_back* back,int back_max) { back[i].url_adr); test_flush; } - back_delete(opt,back, i); // delete backing entry + back_delete(opt,cache,back, i); // delete backing entry } } } @@ -1352,7 +1434,7 @@ void back_clean(httrackp* opt,cache_back* cache,lien_back* back,int back_max) { for(i=0;i max ; i++) { if (back[i].status == -103) { - back_delete(opt,back, i); // delete backing entry + back_delete(opt,cache,back, i); // delete backing entry curr--; } } @@ -1379,7 +1461,7 @@ void back_clean(httrackp* opt,cache_back* cache,lien_back* back,int back_max) { // attente (gestion des buffers des sockets) void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TStamp stat_timestart) { - int i; + unsigned int i_mod; T_SOC nfds=INVALID_SOCKET; fd_set fds,fds_c,fds_e; // fds pour lecture, connect (write), et erreur int nsockets; // nbre sockets @@ -1393,7 +1475,7 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta #if HTS_ANALYSTE int max_loop_chk=0; #endif - + unsigned int mod_random = (unsigned int) ( time_local() + HTS_STAT.HTS_TOTAL_RECV ); // max. number of loops max_loop=8; @@ -1422,7 +1504,9 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta nfds=INVALID_SOCKET; max_c=1; - for(i=0;i0) { if (!back[i].r.is_file) { // not file.. if (back[i].r.soc!=INVALID_SOCKET) { // hey, you never know.. @@ -1561,7 +1646,7 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta else strcpybuff(back[i].r.msg,"Receive Error"); if (back[i].status == -103) { /* Keep-alive socket */ - back_delete(opt,back, i); + back_delete(opt,cache,back, i); } else { back[i].status=0; // terminé } @@ -1619,9 +1704,9 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta if ((back[i].r.soc != INVALID_SOCKET) && (back[i].status==100)) { /* limit nb. connections/seconds to avoid server overload */ - if (opt->maxconn>0) { + /*if (opt->maxconn>0) { Sleep(1000/opt->maxconn); - } + }*/ back[i].ka_time_start=time_local(); if (back[i].timeout>0) { // refresh timeout si besoin est @@ -1754,6 +1839,13 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta } } #endif + else if (back[i].status==1001) { // ftp ready + back[i].status=0; + // finalize transfer + if (back[i].r.statuscode>0) { + back_finalize(opt,cache,back,i); + } + } else if ((back[i].status>0) && (back[i].status<1000)) { // en réception http int dispo=0; @@ -1764,11 +1856,12 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta // données dispo? //## if (back[i].url_adr[0]!=lOCAL_CHAR) - if (!back[i].r.is_file) { - dispo=FD_ISSET(back[i].r.soc,&fds); - } - else + if (back[i].r.is_file) + dispo=1; + else if (back[i].r.ssl) dispo=1; + else + dispo=FD_ISSET(back[i].r.soc,&fds); // Check transfer rate! if (!max_read_bytes) @@ -1795,7 +1888,8 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta if (strnotempty(back[i].url_sav)) { if (strcmp(back[i].url_fil,"/robots.txt")) { if (back[i].r.statuscode==200) { // 'OK' - if (!is_hypertext_mime(back[i].r.contenttype)) { // pas HTML + if (!is_hypertext_mime(back[i].r.contenttype, back[i].url_fil) + ) { // pas HTML if (opt->getmode&2) { // on peut ecrire des non html int fcheck=0; back[i].r.is_write=1; // écrire @@ -1900,7 +1994,7 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta retour_fread=(int) http_xfread1(&(back[i].r),(int) max_read_bytes); // retour_fread=http_fread1(&(back[i].r)); } else - retour_fread=-1; // interruption ou annulation interne (peut ne pas être une erreur) + retour_fread=READ_EOF; // interruption ou annulation interne (peut ne pas être une erreur) // Si réception chunk, tester si on est pas à la fin! if (back[i].status==1) { @@ -1920,27 +2014,25 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta } } else if (back[i].r.keep_alive) { if (back[i].r.size==back[i].r.totalsize) { // fin! - retour_fread=-1; // end + retour_fread=READ_EOF; // end } } } if (retour_fread < 0) { // fin réception back[i].status=0; // terminé - if (back[i].r.soc!=INVALID_SOCKET) { -#if HTS_DEBUG_CLOSESOCK - DEBUG_W("back_wait(4): deletehttp\n"); -#endif - /*KA deletehttp(&back[i].r);*/ - back_maydeletehttp(opt, back, back_max, i); - } - /*KA back[i].r.soc=INVALID_SOCKET; */ + /*KA back[i].r.soc=INVALID_SOCKET; */ #if CHUNKDEBUG==1 if (back[i].is_chunk) printf("[%d] must be the last chunk for %s (connection closed) - %d/%d\n",(int)back[i].r.soc,back[i].url_fil,back[i].r.size,back[i].r.totalsize); #endif - //if ((back[i].r.statuscode==-1) && (strnotempty(back[i].r.msg)==0)) { - if ((back[i].r.statuscode <= 0) && (strnotempty(back[i].r.msg)==0)) { + if (retour_fread < 0 && retour_fread != READ_EOF) { + if (back[i].r.size > 0) + strcatbuff(back[i].r.msg, "Interrupted transfer"); + else + strcatbuff(back[i].r.msg, "No data (connection closed)"); + back[i].r.statuscode=-4; + } else if ((back[i].r.statuscode <= 0) && (strnotempty(back[i].r.msg)==0)) { #if HDEBUG printf("error interruped: %s\n",back[i].r.adr); #endif @@ -1951,6 +2043,15 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta back[i].r.statuscode=-4; } + // Close socket + if (back[i].r.soc!=INVALID_SOCKET) { +#if HTS_DEBUG_CLOSESOCK + DEBUG_W("back_wait(4): deletehttp\n"); +#endif + /*KA deletehttp(&back[i].r);*/ + back_maydeletehttp(opt, cache, back, back_max, i); + } + // finalize transfer if (back[i].r.statuscode>0) { back_finalize(opt,cache,back,i); @@ -2101,16 +2202,16 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta /* Tester totalsize en fin de chunk */ if ((back[i].r.totalsize>0)) { // tester totalsize if (back[i].r.totalsize!=back[i].r.size) { // pas la même! -#if HTS_CL_IS_FATAL - deleteaddr(&back[i].r); - back[i].r.statuscode=-1; - strcpybuff(back[i].r.msg,"Incorrect length"); -#else - // Un warning suffira.. - if (cache->errlog!=NULL) { - fspc(cache->errlog,"warning"); fprintf(cache->errlog,"Incorrect length ("LLintP"!="LLintP" expected) for %s%s"LF,(LLint)back[i].r.size,(LLint)back[i].r.totalsize,back[i].url_adr,back[i].url_fil); + if (!opt->tolerant) { + deleteaddr(&back[i].r); + back[i].r.statuscode=-1; + strcpybuff(back[i].r.msg,"Incorrect length"); + } else { + // Un warning suffira.. + if (cache->errlog!=NULL) { + fspc(cache->errlog,"warning"); fprintf(cache->errlog,"Incorrect length ("LLintP"!="LLintP" expected) for %s%s"LF,(LLint)back[i].r.size,(LLint)back[i].r.totalsize,back[i].url_adr,back[i].url_fil); + } } -#endif } } @@ -2153,22 +2254,6 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta #endif - // Callback -#if HTS_ANALYSTE - if (hts_htmlcheck_receivehead != NULL) { - int test_head=hts_htmlcheck_receivehead(back[i].r.adr, back[i].url_adr, back[i].url_fil, back[i].referer_adr, back[i].referer_fil, &back[i].r); - if (test_head!=1) { - if ((opt->debug>0) && (opt->log!=NULL)) { - fspc(opt->log,"warning"); fprintf(opt->log,"External wrapper aborted transfer, breaking connection: %s%s"LF,back[i].url_adr,back[i].url_fil); test_flush; - } - back[i].status=0; // FINI - deletehttp(&back[i].r); back[i].r.soc=INVALID_SOCKET; - strcpybuff(back[i].r.msg,"External wrapper aborted transfer"); - back[i].r.statuscode = -1; - } - } -#endif - /* Hack for zero-length headers */ if (back[i].status != 0 && back[i].r.adr[0] != '<') { @@ -2223,15 +2308,36 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta } while(strnotempty(rcvd)); // ---------------------------------------- - // libérer mémoire -- après! -- - deleteaddr(&back[i].r); } else { // assume text/html, OK treatfirstline(&back[i].r, back[i].r.adr); noFreebuff=1; } - + // Callback +#if HTS_ANALYSTE + if (hts_htmlcheck_receivehead != NULL) { + int test_head=hts_htmlcheck_receivehead(back[i].r.adr, back[i].url_adr, back[i].url_fil, back[i].referer_adr, back[i].referer_fil, &back[i].r); + if (test_head!=1) { + if ((opt->debug>0) && (opt->log!=NULL)) { + fspc(opt->log,"warning"); fprintf(opt->log,"External wrapper aborted transfer, breaking connection: %s%s"LF,back[i].url_adr,back[i].url_fil); test_flush; + } + back[i].status=0; // FINI + deletehttp(&back[i].r); back[i].r.soc=INVALID_SOCKET; + strcpybuff(back[i].r.msg,"External wrapper aborted transfer"); + back[i].r.statuscode = -1; + } + } +#endif + + // Free headers memory now + // Actually, save them for informational purpose + if (!noFreebuff) { + char* block = back[i].r.adr; + back[i].r.adr = NULL; + deleteaddr(&back[i].r); + back[i].r.headers = block; + } /* Status code and header-response hacks @@ -2325,7 +2431,7 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta if (opt->sizehack) { if (!back[i].is_update) { // mise à jour if (back[i].r.statuscode==200 && !back[i].testmode) { // 'OK' - if (!is_hypertext_mime(back[i].r.contenttype)) { // not HTML + if (!is_hypertext_mime(back[i].r.contenttype, back[i].url_fil)) { // not HTML if (strnotempty(back[i].url_sav)) { // target found int size = fsize(back[i].url_sav); // target size if (size >= 0) { @@ -2360,7 +2466,7 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta if (strnotempty(back[i].url_sav)) { if (strcmp(back[i].url_fil,"/robots.txt")) { if (back[i].r.statuscode==200) { // 'OK' - if (!is_hypertext_mime(back[i].r.contenttype)) { // pas HTML + if (!is_hypertext_mime(back[i].r.contenttype, back[i].url_fil)) { // pas HTML if (back[i].r.statuscode==200) { // "OK" if (back[i].range_req_size>0) { // but Range: requested if (back[i].range_req_size == back[i].r.totalsize) { // And same size @@ -2495,7 +2601,7 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta #endif // Couper connexion /*KA deletehttp(&back[i].r); back[i].r.soc=INVALID_SOCKET;*/ - back_maydeletehttp(opt, back, back_max, i); + back_maydeletehttp(opt, cache, back, back_max, i); back[i].status=0; // terminé // finalize @@ -2520,7 +2626,7 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta printf("partial content: "LLintP" on disk..\n",(LLint)sz); #endif if (sz>=0) { - if (!is_hypertext_mime(back[i].r.contenttype)) { // pas HTML + if (!is_hypertext_mime(back[i].r.contenttype, back[i].url_sav)) { // pas HTML if (opt->getmode&2) { // on peut ecrire des non html **sinon ben euhh sera intercepté plus loin, donc rap sur ce qui va sortir** filenote(back[i].url_sav,NULL); // noter fichier comme connu back[i].r.out=fopen(fconv(back[i].url_sav),"ab"); // append @@ -2591,9 +2697,10 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta if (back[i].status!=0) { // non terminé (erreur) if (!back[i].testmode) { // fichier normal - if (back[i].r.empty && back[i].r.statuscode==200) { // empty response + if (back[i].r.empty /* ?? && back[i].r.statuscode==200 */) { // empty response // Couper connexion - deletehttp(&back[i].r); back[i].r.soc=INVALID_SOCKET; + back_maydeletehttp(opt, cache, back, back_max, i); + /* KA deletehttp(&back[i].r); back[i].r.soc=INVALID_SOCKET; */ back[i].status=0; // terminé if ( deleteaddr(&back[i].r) && (back[i].r.adr=(char*) malloct((INTsys) 2)) ) { back[i].r.adr[0] = 0; @@ -2687,7 +2794,9 @@ void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TSta if (gestion_timeout) { TStamp act; act=time_local(); // temps en secondes - for(i=0;i0) { // réception/connexion/.. if (back[i].timeout>0) { //printf("time check %d\n",((int) (act-back[i].timeout_refresh))-back[i].timeout); @@ -2795,7 +2904,7 @@ LLint back_transfered(LLint nb,lien_back* back,int back_max) { // j: 1 afficher sockets 2 afficher autres 3 tout afficher void back_info(lien_back* back,int i,int j,FILE* fp) { if (back[i].status>=0) { - char s[HTS_URLMAXSIZE*2+1024]; + char BIGSTK s[HTS_URLMAXSIZE*2+1024]; s[0]='\0'; back_infostr(back,i,j,s); strcatbuff(s,LF); @@ -2881,7 +2990,7 @@ void back_infostr(lien_back* back,int i,int j,char* s) { if (aff) { { - char s2[HTS_URLMAXSIZE*2+1024]; + char BIGSTK s2[HTS_URLMAXSIZE*2+1024]; sprintf(s2,"\"%s",back[i].url_adr); strcatbuff(s,s2); if (back[i].url_fil[0]!='/') strcatbuff(s,"/"); diff --git a/src/htsback.h b/src/htsback.h index 74fd540..9587d7e 100644 --- a/src/htsback.h +++ b/src/htsback.h @@ -42,6 +42,9 @@ Please visit our Website: http://www.httrack.com #include "htsbasenet.h" #include "htscore.h" +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE + // backing #define BACK_ADD_TEST "(dummy)" #define BACK_ADD_TEST2 "(dummy2)" @@ -53,16 +56,16 @@ int back_nsoc(lien_back* back,int back_max); int back_nsoc_overall(lien_back* back,int back_max); int back_add(lien_back* back,int back_max,httrackp* opt,cache_back* cache,char* adr,char* fil,char* save,char* referer_adr,char* referer_fil,int test,int* pass2_ptr); int back_stack_available(lien_back* back,int back_max); -int back_search(httrackp* opt, lien_back* back, int back_max); +int back_search(httrackp* opt, cache_back* cache, lien_back* back, int back_max); void back_clean(httrackp* opt,cache_back* cache,lien_back* back,int back_max); void back_wait(lien_back* back,int back_max,httrackp* opt,cache_back* cache,TStamp stat_timestart); -int back_letlive(httrackp* opt, lien_back* back, int p); +int back_letlive(httrackp* opt, cache_back* cache, lien_back* back, int p); int back_searchlive(httrackp* opt, lien_back* back, int back_max, char* search_addr); void back_connxfr(htsblk* src, htsblk* dst); -int back_delete(httrackp* opt,lien_back* back,int p); -int back_maydelete(httrackp* opt, lien_back* back, int p); -void back_maydeletehttp(httrackp* opt, lien_back* back, int back_max, int p); -int back_trylive(httrackp* opt,lien_back* back, int back_max, int p); +int back_delete(httrackp* opt,cache_back* cache,lien_back* back,int p); +int back_maydelete(httrackp* opt, cache_back* cache, lien_back* back, int p); +void back_maydeletehttp(httrackp* opt, cache_back* cache, lien_back* back, int back_max, int p); +int back_trylive(httrackp* opt,cache_back* cache,lien_back* back, int back_max, int p); int back_finalize(httrackp* opt,cache_back* cache,lien_back* back,int p); void back_info(lien_back* back,int i,int j,FILE* fp); void back_infostr(lien_back* back,int i,int j,char* s); @@ -77,8 +80,10 @@ int back_checkmirror(httrackp* opt); #if HTS_XGETHOST #if USE_BEGINTHREAD -PTHREAD_TYPE Hostlookup(void* iadr_p); +PTHREAD_TYPE PTHREAD_TYPE_FNC Hostlookup(void* iadr_p); +#endif #endif + #endif #endif diff --git a/src/htsbase.h b/src/htsbase.h index 139e3ed..9911d73 100644 --- a/src/htsbase.h +++ b/src/htsbase.h @@ -44,9 +44,8 @@ extern "C" { #include "htsglobal.h" -// size_t et mode_t -#include -#include +#include +#include #ifdef HAVE_UNISTD_H #include @@ -76,30 +75,24 @@ extern "C" { #define min(a,b) ((a)>(b)?(b):(a)) #define max(a,b) ((a)>(b)?(a):(b)) +#ifndef _WIN32 +#undef Sleep +#define min(a,b) ((a)>(b)?(b):(a)) +#define max(a,b) ((a)>(b)?(a):(b)) +#define Sleep(a) { if (((a)*1000)%1000000) usleep(((a)*1000)%1000000); if (((a)*1000)/1000000) sleep(((a)*1000)/1000000); } +#endif + // teste égalité de 2 chars, case insensitive #define hichar(a) ((((a)>='a') && ((a)<='z')) ? ((a)-('a'-'A')) : (a)) #define streql(a,b) (hichar(a)==hichar(b)) -// is this MIME an hypertext MIME (text/html), html/js-style or other script/text type? -#define HTS_HYPERTEXT_DEFAULT_MIME "text/html" -#define is_hypertext_mime(a) \ - ( (strfield2((a),"text/html")!=0)\ - || (strfield2((a),"application/x-javascript")!=0) \ - || (strfield2((a),"text/css")!=0) \ - /*|| (strfield2((a),"text/vnd.wap.wml")!=0)*/ \ - || (strfield2((a),"image/svg+xml")!=0) \ - || (strfield2((a),"image/svg-xml")!=0) \ - /*|| (strfield2((a),"audio/x-pn-realaudio")!=0) */\ - ) +// caractère maj +#define isUpperLetter(a) ( ((a) >= 'A') && ((a) <= 'Z') ) -#define may_be_hypertext_mime(a) \ - (\ - (strfield2((a),"audio/x-pn-realaudio")!=0) \ - ) +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE -// caractère maj -#define isUpperLetter(a) ( ((a) >= 'A') && ((a) <= 'Z') ) // functions #ifdef _WIN32 @@ -112,10 +105,15 @@ extern "C" { typedef void (*t_abortLog)(char* msg, char* file, int line); extern HTSEXT_API t_abortLog abortLog__; #define abortLog(a) abortLog__(a, __FILE__, __LINE__) +#define _ , +#ifndef _WIN32_WCE #define abortLogFmt(a) do { \ FILE* fp = fopen("CRASH.TXT", "wb"); \ if (!fp) fp = fopen("/tmp/CRASH.TXT", "wb"); \ if (!fp) fp = fopen("C:\\CRASH.TXT", "wb"); \ + if (!fp) fp = fopen("\\Temp\\CRASH.TXT", "wb"); \ + if (!fp) fp = fopen("\\CRASH.TXT", "wb"); \ + if (!fp) fp = fopen("CRASH.TXT", "wb"); \ if (fp) { \ fprintf(fp, "HTTrack " HTTRACK_VERSIONID " closed at '" __FILE__ "', line %d\r\n", __LINE__); \ fprintf(fp, "Reason:\r\n"); \ @@ -125,22 +123,12 @@ extern HTSEXT_API t_abortLog abortLog__; fclose(fp); \ } \ } while(0) - - -#define _ , +#else #define abortLogFmt(a) do { \ - FILE* fp = fopen("CRASH.TXT", "wb"); \ - if (!fp) fp = fopen("/tmp/CRASH.TXT", "wb"); \ - if (!fp) fp = fopen("C:\\CRASH.TXT", "wb"); \ - if (fp) { \ - fprintf(fp, "HTTrack " HTTRACK_VERSIONID " closed at '" __FILE__ "', line %d\r\n", __LINE__); \ - fprintf(fp, "Reason:\r\n"); \ - fprintf(fp, a); \ - fprintf(fp, "\r\n"); \ - fflush(fp); \ - fclose(fp); \ - } \ + XCEShowMessageA("HTTrack " HTTRACK_VERSIONID " closed at '" __FILE__ "', line %d\r\nReason:\r\n%s\r\n", __LINE__, a); \ } while(0) +#endif + #define assertf(exp) do { \ if (! ( exp ) ) { \ abortLog("assert failed: " #exp); \ @@ -167,17 +155,20 @@ extern HTSEXT_API t_abortLog abortLog__; #define malloct(A) malloc(A) #define calloct(A,B) calloc((A), (B)) #define freet(A) do { assertnf((A) != NULL); if ((A) != NULL) { free(A); (A) = NULL; } } while(0) +#define strdupt(A) strdup(A) #define realloct(A,B) ( ((A) != NULL) ? realloc((A), (B)) : malloc(B) ) #define memcpybuff(A, B, N) memcpy((A), (B), (N)) #else /* debug version */ #define malloct(A) hts_malloc(A) #define calloct(A,B) hts_calloc(A,B) +#define strdupt(A) hts_strdup(A) #define freet(A) do { hts_free(A); (A) = NULL; } while(0) #define realloct(A,B) hts_realloc(A,B) void hts_freeall(); void* hts_malloc (size_t); void* hts_calloc(size_t,size_t); +char* hts_strdup(char*); void* hts_xmalloc(size_t,size_t); void hts_free (void*); void* hts_realloc (void*,size_t); @@ -379,9 +370,10 @@ extern HTSEXT_API int htsMemoryFastXfr; #endif +#endif #ifdef __cplusplus - }; +} #endif #endif diff --git a/src/htsbasenet.h b/src/htsbasenet.h index 71ac9c9..f2a6c53 100644 --- a/src/htsbasenet.h +++ b/src/htsbasenet.h @@ -41,15 +41,23 @@ Please visit our Website: http://www.httrack.com #if HTS_WIN #if HTS_INET6==0 - #include + #include #else + +#ifndef _WIN32_WCE #undef HTS_USESCOPEID #define WIN32_LEAN_AND_MEAN #include #include #include +#else + #include + #include +#endif + #endif - typedef SOCKET T_SOC; + +typedef SOCKET T_SOC; typedef struct hostent FAR t_hostent; #else @@ -67,9 +75,6 @@ Please visit our Website: http://www.httrack.com */ #ifndef HTS_OPENSSL_H_INCLUDED #define HTS_OPENSSL_H_INCLUDED -#ifdef __cplusplus -extern "C" { -#endif /* #include @@ -77,6 +82,9 @@ extern "C" { #include */ +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE + /* OpenSSL definitions */ #define SSL_shutdown hts_ptrfunc_SSL_shutdown #define SSL_free hts_ptrfunc_SSL_free @@ -96,6 +104,9 @@ extern "C" { #define ERR_error_string hts_ptrfunc_ERR_error_string #define SSL_load_error_strings hts_ptrfunc_SSL_load_error_strings #define SSL_CTX_ctrl hts_ptrfunc_SSL_CTX_ctrl + +#endif + /* */ typedef void SSL_CTX; typedef void* SSL; @@ -118,6 +129,10 @@ typedef SSL_CTX * (*t_SSL_CTX_new)(SSL_METHOD *method); typedef char * (*t_ERR_error_string)(unsigned long e, char *buf); typedef void (*t_SSL_load_error_strings)(void); typedef long (*t_SSL_CTX_ctrl)(SSL_CTX *ctx, int cmd, long larg, char *parg); + +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE + extern int SSL_is_available; extern t_SSL_shutdown SSL_shutdown; extern t_SSL_free SSL_free; @@ -137,6 +152,9 @@ extern t_SSL_CTX_new SSL_CTX_new; extern t_ERR_error_string ERR_error_string; extern t_SSL_load_error_strings SSL_load_error_strings; extern t_SSL_CTX_ctrl SSL_CTX_ctrl; + +#endif + /* From /usr/include/openssl/ssl.h */ @@ -154,9 +172,6 @@ From /usr/include/openssl/ssl.h SSL_CTX_ctrl(ctx,SSL_CTRL_OPTIONS,op,NULL) //#include -#ifdef __cplusplus - }; -#endif /* OpenSSL structure */ extern SSL_CTX *openssl_ctx; diff --git a/src/htsbauth.c b/src/htsbauth.c index 23a22af..cdc7f1c 100644 --- a/src/htsbauth.c +++ b/src/htsbauth.c @@ -35,15 +35,14 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE #include "htsbauth.h" /* specific definitions */ #include "htsglobal.h" #include "htslib.h" -#include -#include -#include #include "htsnostatic.h" @@ -171,17 +170,17 @@ char* cookie_nextfield(char* a) { // lire également (Windows seulement) les *@*.txt (cookies IE copiés) // !=0 : erreur int cookie_load(t_cookie* cookie,char* fpath,char* name) { - cookie->data[0]='\0'; + // cookie->data[0]='\0'; // Fusionner d'abord les éventuels cookies IE #if HTS_WIN { - WIN32_FIND_DATA find; + WIN32_FIND_DATAA find; HANDLE h; char pth[MAX_PATH + 32]; strcpybuff(pth,fpath); strcatbuff(pth,"*@*.txt"); - h = FindFirstFile(pth,&find); + h = FindFirstFileA((char*)pth,&find); if (h != INVALID_HANDLE_VALUE) { do { if (!(find.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY )) @@ -191,19 +190,33 @@ int cookie_load(t_cookie* cookie,char* fpath,char* name) { char cook_name[256]; char cook_value[1000]; char domainpathpath[512]; + char dummy[512]; // char domain[256]; // domaine cookie (.netscape.com) char path[256]; // chemin (/) int cookie_merged=0; - linput(fp,cook_name,250); - if (!feof(fp)) { - linput(fp,cook_value,250); - if ( (!feof(fp)) && (strnotempty(cook_value)) ) { - linput(fp,domainpathpath,500); - if (strnotempty(domainpathpath)) { - if (ident_url_absolute(domainpathpath,domain,path)>=0) { - cookie_add(cookie,cook_name,cook_value,domain,path); - cookie_merged=1; + // + // Read all cookies + while( ! feof(fp) ) { + cook_name[0] = cook_value[0] = domainpathpath[0] + = dummy[0] = domain[0] = path[0] = '\0'; + linput(fp,cook_name,250); + if ( ! feof(fp) ) { + linput(fp,cook_value,250); + if ( ! feof(fp) ) { + int i; + linput(fp,domainpathpath,500); + /* Read 6 other useless values */ + for(i = 0 ; ! feof(fp) && i < 6 ; i++) { + linput(fp,dummy,500); + } + if (strnotempty(cook_name) + && strnotempty(cook_value) + && strnotempty(domainpathpath)) { + if (ident_url_absolute(domainpathpath,domain,path)>=0) { + cookie_add(cookie,cook_name,cook_value,domain,path); + cookie_merged=1; + } } } } @@ -213,7 +226,7 @@ int cookie_load(t_cookie* cookie,char* fpath,char* name) { remove(fconcat(fpath,find.cFileName)); } // if fp } - } while(FindNextFile(h,&find)); + } while(FindNextFileA(h,&find)); FindClose(h); } } @@ -223,7 +236,7 @@ int cookie_load(t_cookie* cookie,char* fpath,char* name) { { FILE* fp = fopen(fconcat(fpath,name),"rb"); if (fp) { - char line[8192]; + char BIGSTK line[8192]; while( (!feof(fp)) && (((int) strlen(cookie->data)) < cookie->max_len)) { rawlinput(fp,line,8100); if (strnotempty(line)) { @@ -232,7 +245,7 @@ int cookie_load(t_cookie* cookie,char* fpath,char* name) { char domain[256]; // domaine cookie (.netscape.com) char path[256]; // chemin (/) char cook_name[256]; // nom cookie (MYCOOK) - char cook_value[8192]; // valeur (ID=toto,S=1234) + char BIGSTK cook_value[8192]; // valeur (ID=toto,S=1234) strcpybuff(domain,cookie_get(line,0)); // host strcpybuff(path,cookie_get(line,2)); // path strcpybuff(cook_name,cookie_get(line,5)); // name @@ -256,7 +269,7 @@ int cookie_load(t_cookie* cookie,char* fpath,char* name) { // !=0 : erreur int cookie_save(t_cookie* cookie,char* name) { if (strnotempty(cookie->data)) { - char line[8192]; + char BIGSTK line[8192]; FILE* fp = fopen(fconv(name),"wb"); if (fp) { char* a=cookie->data; diff --git a/src/htsbauth.h b/src/htsbauth.h index d361d83..4066ece 100644 --- a/src/htsbauth.h +++ b/src/htsbauth.h @@ -48,12 +48,16 @@ typedef struct bauth_chain { // buffer pour les cookies et authentification -typedef struct { +typedef struct t_cookie { int max_len; char data[32768]; bauth_chain auth; } t_cookie; + +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE + // cookies int cookie_add(t_cookie* cookie,char* cook_name,char* cook_value,char* domain,char* path); int cookie_del(t_cookie* cookie,char* cook_name,char* domain,char* path); @@ -70,5 +74,6 @@ int bauth_add(t_cookie* cookie,char* adr,char* fil,char* auth); char* bauth_check(t_cookie* cookie,char* adr,char* fil); char* bauth_prefix(char* adr,char* fil); +#endif #endif diff --git a/src/htscache.c b/src/htscache.c index b90fa67..aa9a6c8 100644 --- a/src/htscache.c +++ b/src/htscache.c @@ -35,15 +35,19 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + #include "htscache.h" /* specific definitions */ #include "htsbase.h" #include "htsbasenet.h" #include "htsmd5.h" -#include -#include -#include +#include + +#include "htszlib.h" + #include "htsnostatic.h" /* END specific definitions */ @@ -116,10 +120,15 @@ void cache_mayadd(httrackp* opt,cache_back* cache,htsblk* r,char* url_adr,char* // ---stockage en cache--- // stocker dans le cache? if (opt->cache) { - if (cache->dat!=NULL) { + if (cache_writable(cache)) { // c'est le seul endroit ou l'on ajoute des elements dans le cache (fichier entier ou header) // on stocke tout fichier "ok", mais également les réponses 404,301,302... - if ((r->statuscode==200) /* stocker réponse standard, plus */ + if ( +#if 1 + r->statuscode > 0 +#else + /* We don't store 5XX errors, because it might be a server problem */ + (r->statuscode==200) /* stocker réponse standard, plus */ || (r->statuscode==204) /* no content */ || (r->statuscode==301) /* moved perm */ || (r->statuscode==302) /* moved temp */ @@ -129,13 +138,33 @@ void cache_mayadd(httrackp* opt,cache_back* cache,htsblk* r,char* url_adr,char* || (r->statuscode==403) /* unauthorized */ || (r->statuscode==404) /* not found */ || (r->statuscode==410) /* gone */ +#endif ) { /* ne pas stocker si la page générée est une erreur */ if (!r->is_file) { // stocker fichiers (et robots.txt) - if ( (strnotempty(url_save)) || (strcmp(url_fil,"/robots.txt")==0)) { + if ( url_save == NULL || (strnotempty(url_save)) || (strcmp(url_fil,"/robots.txt")==0)) { // ajouter le fichier au cache - cache_add(*r,url_adr,url_fil,url_save,cache->ndx,cache->dat,opt->all_in_cache); + cache_add(cache,*r,url_adr,url_fil,url_save,opt->all_in_cache); + // + // store a reference NOT to redo the same test zillions of times! + // (problem reported by Lars Clausen) + // we just store statuscode + location (if any) + if (url_save == NULL && r->statuscode / 100 >= 3) { + // cached "fast" header doesn't uet exists + if (inthash_read((inthash)cache->cached_tests, concat(url_adr, url_fil), NULL) == 0) { + char BIGSTK tempo[HTS_URLMAXSIZE*2]; + sprintf(tempo, "%d", (int)r->statuscode); + if (r->location != NULL && r->location[0] != '\0') { + strcatbuff(tempo, "\n"); + strcatbuff(tempo, r->location); + } + if ((opt->debug>0) && (opt->log!=NULL)) { + fspc(opt->log,"debug"); fprintf(opt->log, "Cached fast-header response: %s%s is %d"LF, url_adr, url_fil, (int)r->statuscode); + } + inthash_add((inthash)cache->cached_tests, concat(url_adr, url_fil), (long int)strdupt(tempo)); + } + } } } } @@ -145,13 +174,222 @@ void cache_mayadd(httrackp* opt,cache_back* cache,htsblk* r,char* url_adr,char* } + +#if 01 + +/* test only - to be removed */ + +#define ZIP_FIELD_STRING(headers, headersSize, field, value) do { \ + if ( (value != NULL) && (value)[0] != '\0') { \ + sprintf(headers + headersSize, "%s: %s\r\n", field, (value != NULL) ? (value) : ""); \ + (headersSize) += (int) strlen(headers + headersSize); \ + } \ +} while(0) +#define ZIP_FIELD_INT(headers, headersSize, field, value) do { \ + if ( (value != 0) ) { \ + sprintf(headers + headersSize, "%s: "LLintP"\r\n", field, (LLint)(value)); \ + (headersSize) += (int) strlen(headers + headersSize); \ + } \ +} while(0) +#define ZIP_FIELD_INT_FORCE(headers, headersSize, field, value) do { \ + sprintf(headers + headersSize, "%s: "LLintP"\r\n", field, (LLint)(value)); \ + (headersSize) += (int) strlen(headers + headersSize); \ +} while(0) + +struct cache_back_zip_entry { + unsigned long int hdrPos; + unsigned long int size; + int compressionMethod; +}; + +#define ZIP_READFIELD_STRING(line, value, refline, refvalue) do { \ + if (line[0] != '\0' && strfield2(line, refline)) { \ + strcpybuff(refvalue, value); \ + line[0] = '\0'; \ + } \ +} while(0) +#define ZIP_READFIELD_INT(line, value, refline, refvalue) do { \ + if (line[0] != '\0' && strfield2(line, refline)) { \ + int intval = 0; \ + sscanf(value, "%d", &intval); \ + (refvalue) = intval; \ + line[0] = '\0'; \ + } \ +} while(0) + + +/* Ajout d'un fichier en cache */ +void cache_add(cache_back* cache,htsblk r,char* url_adr,char* url_fil,char* url_save,int all_in_cache) { + char BIGSTK filemame[HTS_URLMAXSIZE*4]; + int dataincache=0; // put data in cache ? + char BIGSTK headers[8192]; + int headersSize = 0; + int entryBodySize = 0; + int entryFilenameSize = 0; + zip_fileinfo fi; + + // robots.txt hack + if (url_save == NULL) { + dataincache=0; // testing links + } + else { + if ( (strnotempty(url_save)==0) ) { + if (strcmp(url_fil,"/robots.txt")==0) // robots.txt + dataincache=1; + else + return; // error (except robots.txt) + } + + /* Data in cache ? */ + if (is_hypertext_mime(r.contenttype, url_fil)) + dataincache=1; + else if (all_in_cache) + dataincache=1; + } + + if (r.size < 0) // error + return; + + // data in cache + if (dataincache) { + assertf(((int) r.size) == r.size); + entryBodySize = (int) r.size; + } + + /* Fields */ + headers[0] = '\0'; + headersSize = 0; + /* */ + { + char* message; + if (strlen(r.msg) < 32) { + message = r.msg; + } else { + message = "(See X-StatusMessage)"; + } + /* 64 characters MAX for first line */ + sprintf(headers + headersSize, "HTTP/1.%c %d %s\r\n", '1', r.statuscode, r.msg); + } + headersSize += (int) strlen(headers + headersSize); + /* Second line MUST ALWAYS be X-In-Cache */ + ZIP_FIELD_INT_FORCE(headers, headersSize, "X-In-Cache", dataincache); + ZIP_FIELD_INT(headers, headersSize, "X-StatusCode", r.statuscode); + ZIP_FIELD_STRING(headers, headersSize, "X-StatusMessage", r.msg); + ZIP_FIELD_INT(headers, headersSize, "X-Size", r.size); // size + ZIP_FIELD_STRING(headers, headersSize, "Content-Type", r.contenttype); // contenttype + ZIP_FIELD_STRING(headers, headersSize, "X-Charset", r.charset); // contenttype + ZIP_FIELD_STRING(headers, headersSize, "Last-Modified", r.lastmodified); // last-modified + ZIP_FIELD_STRING(headers, headersSize, "Etag", r.etag); // Etag + ZIP_FIELD_STRING(headers, headersSize, "Location", r.location); // 'location' pour moved + ZIP_FIELD_STRING(headers, headersSize, "Content-Disposition", r.cdispo); // Content-disposition + ZIP_FIELD_STRING(headers, headersSize, "X-Addr", url_adr); // Original address + ZIP_FIELD_STRING(headers, headersSize, "X-Fil", url_fil); // Original URI filename + ZIP_FIELD_STRING(headers, headersSize, "X-Save", url_save); // Original save filename + + entryFilenameSize = (int) ( strlen(url_adr) + strlen(url_fil)); + + /* Filename */ + if (!link_has_authority(url_adr)) { + strcpybuff(filemame, "http://"); + } else { + strcpybuff(filemame, ""); + } + strcatbuff(filemame, url_adr); + strcatbuff(filemame, url_fil); + + /* Time */ + memset(&fi, 0, sizeof(fi)); + if (r.lastmodified[0] != '\0') { + struct tm* tm_s=convert_time_rfc822(r.lastmodified); + if (tm_s) { + fi.tmz_date.tm_sec = (uInt) tm_s->tm_sec; + fi.tmz_date.tm_min = (uInt) tm_s->tm_min; + fi.tmz_date.tm_hour = (uInt) tm_s->tm_hour; + fi.tmz_date.tm_mday = (uInt) tm_s->tm_mday; + fi.tmz_date.tm_mon = (uInt) tm_s->tm_mon; + fi.tmz_date.tm_year = (uInt) tm_s->tm_year; + } + } + + /* Open file - NOTE: headers in "comment" */ + if (zipOpenNewFileInZip((zipFile) cache->zipOutput, + filemame, + &fi, + /* + Store headers in realtime in the local file directory as extra field + In case of crash, we'll be able to recover the whole ZIP file by rescanning it + */ + headers, + (uInt) strlen(headers), + NULL, + 0, + NULL, /* comment */ + Z_DEFLATED, + Z_DEFAULT_COMPRESSION) != Z_OK) + { + int zip_disk_write_failed = 0; + assertf(zip_disk_write_failed); + } + + /* Write data in cache */ + if (dataincache) { + if (r.is_write == 0) { + if (r.size > 0 && r.adr != NULL) { + if (zipWriteInFileInZip((zipFile) cache->zipOutput, r.adr, (int) r.size) != Z_OK) { + int zip_disk_write_failed = 0; + assertf(zip_disk_write_failed); + } + } + } else { + FILE* fp; + // On recopie le fichier.. + LLint file_size=fsize(fconv(url_save)); + if (file_size>=0) { + fp=fopen(fconv(url_save),"rb"); + if (fp!=NULL) { + char BIGSTK buff[32768]; + INTsys nl; + do { + nl=fread(buff,1,32768,fp); + if (nl>0) { + if (zipWriteInFileInZip((zipFile) cache->zipOutput, buff, (int) nl) != Z_OK) { + int zip_disk_write_failed = 0; + assertf(zip_disk_write_failed); + } + } + } while(nl>0); + fclose(fp); + } else { + /* Err FIXME - lost file */ + } + } /* Empty files are OK */ + } + } + + /* Close */ + if (zipCloseFileInZip((zipFile) cache->zipOutput) != Z_OK) { + int zip_disk_write_failed = 0; + assertf(zip_disk_write_failed); + } + + /* Flush */ + if (zipFlush((zipFile) cache->zipOutput) != 0) { + int zip_disk_write_failed = 0; + assertf(zip_disk_write_failed); + } +} + +#else + /* Ajout d'un fichier en cache */ -void cache_add(htsblk r,char* url_adr,char* url_fil,char* url_save,FILE* cache_ndx,FILE* cache_dat,int all_in_cache) { +void cache_add(cache_back* cache,htsblk r,char* url_adr,char* url_fil,char* url_save,int all_in_cache) { int pos; char s[256]; - char buff[HTS_URLMAXSIZE*4]; + char BIGSTK buff[HTS_URLMAXSIZE*4]; int ok=1; int dataincache=0; // donnée en cache? + FILE* cache_ndx = cache->ndx; + FILE* cache_dat = cache->dat; /*char digest[32+2];*/ /*digest[0]='\0';*/ @@ -159,6 +397,8 @@ void cache_add(htsblk r,char* url_adr,char* url_fil,char* url_save,FILE* cache_n if ( (strnotempty(url_save)==0) ) { if (strcmp(url_fil,"/robots.txt")==0) // robots.txt dataincache=1; + else if (strcmp(url_fil,"/test")==0) // testing links + dataincache=0; else return; // erreur (sauf robots.txt) } @@ -167,7 +407,7 @@ void cache_add(htsblk r,char* url_adr,char* url_fil,char* url_save,FILE* cache_n return; // refusé.. // Mettre les *donées* en cache ? - if (is_hypertext_mime(r.contenttype)) // html, mise en cache des données et + if (is_hypertext_mime(r.contenttype, url_fil)) // html, mise en cache des données et dataincache=1; // pas uniquement de l'en tête else if (all_in_cache) dataincache=1; // forcer tout en cache @@ -209,6 +449,7 @@ void cache_add(htsblk r,char* url_adr,char* url_fil,char* url_save,FILE* cache_n && cache_wstr(cache_dat,url_adr) != -1 // Original address && cache_wstr(cache_dat,url_fil) != -1 // Original URI filename && cache_wstr(cache_dat,url_save) != -1 // Original save filename + && cache_wstr(cache_dat,r.headers) != -1 // Full HTTP Headers && cache_wstr(cache_dat,"HTS") != -1 // end of header ) { ok=1; /* ok */ @@ -238,7 +479,7 @@ void cache_add(htsblk r,char* url_adr,char* url_fil,char* url_save,FILE* cache_n if (cache_wLLint(cache_dat,file_size)!=-1) { fp=fopen(fconv(url_save),"rb"); if (fp!=NULL) { - char buff[32768]; + char BIGSTK buff[32768]; INTsys nl; do { nl=fread(buff,1,32768,fp); @@ -275,6 +516,8 @@ void cache_add(htsblk r,char* url_adr,char* url_fil,char* url_save,FILE* cache_n fflush(cache_dat); fflush(cache_ndx); } +#endif + htsblk cache_read(httrackp* opt,cache_back* cache,char* adr,char* fil,char* save,char* location) { return cache_readex(opt,cache,adr,fil,save,location,NULL,0); @@ -284,19 +527,274 @@ htsblk cache_read_ro(httrackp* opt,cache_back* cache,char* adr,char* fil,char* s return cache_readex(opt,cache,adr,fil,save,location,NULL,1); } +static htsblk cache_readex_old(httrackp* opt,cache_back* cache,char* adr,char* fil,char* save,char* location, + char* return_save, int readonly); + +static htsblk cache_readex_new(httrackp* opt,cache_back* cache,char* adr,char* fil,char* save,char* location, + char* return_save, int readonly); + // lecture d'un fichier dans le cache // si save==null alors test unqiquement htsblk cache_readex(httrackp* opt,cache_back* cache,char* adr,char* fil,char* save,char* location, char* return_save, int readonly) { + if (cache->zipInput != NULL) { + return cache_readex_new(opt, cache, adr, fil, save, location, return_save, readonly); + } else { + return cache_readex_old(opt, cache, adr, fil, save, location, return_save, readonly); + } +} + +// lecture d'un fichier dans le cache +// si save==null alors test unqiquement +static htsblk cache_readex_new(httrackp* opt,cache_back* cache,char* adr,char* fil,char* save,char* location, + char* return_save, int readonly) { + char BIGSTK location_default[HTS_URLMAXSIZE*2]; + char BIGSTK buff[HTS_URLMAXSIZE*2]; + char BIGSTK previous_save[HTS_URLMAXSIZE*2]; + long int hash_pos; + int hash_pos_return; + htsblk r; + memset(&r, 0, sizeof(htsblk)); r.soc=INVALID_SOCKET; + + if (location) { + r.location = location; + } else { + r.location = location_default; + } + strcpybuff(r.location, ""); + strcpybuff(buff, adr); + strcatbuff(buff,fil); + hash_pos_return = inthash_read((inthash)cache->hashtable, buff, (long int*)&hash_pos); + /* avoid errors on data entries */ + if (adr[0] == '/' && adr[1] == '/' && adr[2] == '[') { +#if HTS_FAST_CACHE + hash_pos_return = 0; +#else + a = NULL; +#endif + } + + if (hash_pos_return) { + uLong posInZip; + if (hash_pos > 0) { + posInZip = (uLong) hash_pos; + } else { + posInZip = (uLong) -hash_pos; + } + if (unzSetOffset((unzFile) cache->zipInput, posInZip) == Z_OK) { + /* Read header (Max 8KiB) */ + if (unzOpenCurrentFile((unzFile) cache->zipInput) == Z_OK) { + char BIGSTK headerBuff[8192 + 2]; + int readSizeHeader; + int totalHeader = 0; + int dataincache = 0; + + /* For BIG comments */ + headerBuff[0] + = headerBuff[sizeof(headerBuff) - 1] + = headerBuff[sizeof(headerBuff) - 2] + = headerBuff[sizeof(headerBuff) - 3] = '\0'; + + if ( (readSizeHeader = unzGetLocalExtrafield((unzFile) cache->zipInput, headerBuff, sizeof(headerBuff) - 2)) > 0) + /*if (unzGetCurrentFileInfo((unzFile) cache->zipInput, NULL, + NULL, 0, NULL, 0, headerBuff, sizeof(headerBuff) - 2) == Z_OK ) */ + { + int offset = 0; + char BIGSTK line[HTS_URLMAXSIZE + 2]; + int lineEof = 0; + /*readSizeHeader = (int) strlen(headerBuff);*/ + headerBuff[readSizeHeader] = '\0'; + do { + char* value; + line[0] = '\0'; + offset += binput(headerBuff + offset, line, sizeof(line) - 2); + if (line[0] == '\0') { + lineEof = 1; + } + value = strchr(line, ':'); + if (value != NULL) { + *value++ = '\0'; + if (*value == ' ' || *value == '\t') value++; + ZIP_READFIELD_INT(line, value, "X-In-Cache", dataincache); + ZIP_READFIELD_INT(line, value, "X-Statuscode", r.statuscode); + ZIP_READFIELD_STRING(line, value, "X-StatusMessage", r.msg); // msg + ZIP_READFIELD_INT(line, value, "X-Size", r.size); // size + ZIP_READFIELD_STRING(line, value, "Content-Type", r.contenttype); // contenttype + ZIP_READFIELD_STRING(line, value, "X-Charset", r.charset); // contenttype + ZIP_READFIELD_STRING(line, value, "Last-Modified", r.lastmodified); // last-modified + ZIP_READFIELD_STRING(line, value, "Etag", r.etag); // Etag + ZIP_READFIELD_STRING(line, value, "Location", r.location); // 'location' pour moved + ZIP_READFIELD_STRING(line, value, "Content-Disposition", r.cdispo); // Content-disposition + ZIP_READFIELD_STRING(line, value, "X-Addr", previous_save); // Original address + ZIP_READFIELD_STRING(line, value, "X-Fil", previous_save); // Original URI filename + ZIP_READFIELD_STRING(line, value, "X-Save", previous_save); // Original save filename + } + } while(offset < readSizeHeader && !lineEof); + totalHeader = offset; + + /* Complete fields */ + r.totalsize=r.size; + r.adr=NULL; + r.out=NULL; + r.fp=NULL; + + if (save != NULL) { /* ne pas lire uniquement header */ + int ok = 0; + +#if HTS_DIRECTDISK + // Court-circuit: + // Peut-on stocker le fichier directement sur disque? + if (ok) { + if (r.msg[0] == '\0') { + strcpybuff(r.msg,"Cache Read Error : Unexpected error"); + } + } + else if (!readonly && r.statuscode==200 && !is_hypertext_mime(r.contenttype, fil) && strnotempty(save)) { // pas HTML, écrire sur disk directement + + r.is_write=1; // écrire + if (fexist(fconv(save))) { // un fichier existe déja + //if (fsize(fconv(save))==r.size) { // même taille -- NON tant pis (taille mal declaree) + ok=1; // plus rien à faire + filenote(save,NULL); // noter comme connu + } + + if (!dataincache && !ok) { // Pas de donnée en cache et fichier introuvable : erreur! + if (opt->norecatch) { + filecreateempty(save); + // + r.statuscode=-1; + strcpybuff(r.msg,"File deleted by user not recaught"); + ok=1; // ne pas récupérer (et pas d'erreur) + } else { + r.statuscode=-1; + strcpybuff(r.msg,"Previous cache file not found"); + ok=1; // ne pas récupérer + } + } + + if (!ok) { + r.out=filecreate(save); +#if HDEBUG + printf("direct-disk: %s\n",save); +#endif + if (r.out!=NULL) { + char BIGSTK buff[32768+4]; + LLint size = r.size; + if (size > 0) { + INTsys nl; + do { + nl = unzReadCurrentFile((unzFile) cache->zipInput, buff, (int)minimum(size, 32768)); + if (nl>0) { + size-=nl; + if ((INTsys)fwrite(buff,1,(INTsys)nl,r.out)!=nl) { // erreur + r.statuscode=-1; + strcpybuff(r.msg,"Cache Read Error : Read To Disk"); + } + } + } while((nl>0) && (size>0) && (r.statuscode!=-1)); + } + + fclose(r.out); + r.out=NULL; +#if HTS_WIN==0 + chmod(save,HTS_ACCESS_FILE); +#endif + //xxusercommand(opt,0,NULL,fconv(save), adr, fil); + } else { + r.statuscode=-1; + strcpybuff(r.msg,"Cache Write Error : Unable to Create File"); + //printf("%s\n",save); + } + } + + } else +#endif + { // lire en mémoire + + if (!dataincache) { + if (strnotempty(save)) { // Pas de donnée en cache, bizarre car html!!! + r.statuscode=-1; + strcpybuff(r.msg,"Previous cache file not found (2)"); + } else { /* Read in memory from cache */ + if (strnotempty(return_save) && fexist(return_save)) { + FILE* fp = fopen(fconv(return_save), "rb"); + if (fp != NULL) { + r.adr=(char*) malloct((INTsys)r.size + 4); + if (adr != NULL) { + if (r.size > 0 && fread(r.adr, 1, (INTsys) r.size, fp) != r.size) { + r.statuscode=-1; + strcpybuff(r.msg,"Read error in cache disk data"); + } + } else { + r.statuscode=-1; + strcpybuff(r.msg,"Read error (memory exhausted) from cache"); + } + fclose(fp); + } + } else { + r.statuscode=-1; + strcpybuff(r.msg,"Cache file not found on disk"); + } + } + } else { + // lire fichier (d'un coup) + r.adr=(char*) malloct((INTsys)r.size+4); + if (r.adr!=NULL) { + if (unzReadCurrentFile((unzFile) cache->zipInput, r.adr, (INTsys)r.size) != r.size) { // erreur + freet(r.adr); + r.adr=NULL; + r.statuscode=-1; + strcpybuff(r.msg,"Cache Read Error : Read Data"); + } else + *(r.adr+r.size)='\0'; + //printf(">%s status %d\n",back[p].r.contenttype,back[p].r.statuscode); + } else { // erreur + r.statuscode=-1; + strcpybuff(r.msg,"Cache Memory Error"); + } + } + } + } // si save==null, ne rien charger (juste en tête) + + + } else { + r.statuscode=-1; + strcpybuff(r.msg,"Cache Read Error : Read Header Data"); + } + unzCloseCurrentFile((unzFile) cache->zipInput); + } else { + r.statuscode=-1; + strcpybuff(r.msg,"Cache Read Error : Open File"); + } + + } else { + r.statuscode=-1; + strcpybuff(r.msg,"Cache Read Error : Bad Offset"); + } + } else { + r.statuscode=-1; + strcpybuff(r.msg,"File Cache Entry Not Found"); + } + if (!location) { /* don't export internal buffer */ + r.location = NULL; + } + return r; +} + + +// lecture d'un fichier dans le cache +// si save==null alors test unqiquement +static htsblk cache_readex_old(httrackp* opt,cache_back* cache,char* adr,char* fil,char* save,char* location, + char* return_save, int readonly) { #if HTS_FAST_CACHE long int hash_pos; int hash_pos_return; #else char* a; #endif - char buff[HTS_URLMAXSIZE*2]; - char location_default[HTS_URLMAXSIZE*2]; - char previous_save[HTS_URLMAXSIZE*2]; + char BIGSTK buff[HTS_URLMAXSIZE*2]; + char BIGSTK location_default[HTS_URLMAXSIZE*2]; + char BIGSTK previous_save[HTS_URLMAXSIZE*2]; htsblk r; int ok=0; int header_only=0; @@ -388,6 +886,9 @@ htsblk cache_readex(httrackp* opt,cache_back* cache,char* adr,char* fil,char* sa strcpybuff(return_save, previous_save); } } + if (cache->version >= 5) { + r.headers = cache_rstr_addr(cache->olddat); + } // cache_rstr(cache->olddat,check); if (strcmp(check,"HTS")==0) { /* intégrité OK */ @@ -425,7 +926,7 @@ htsblk cache_readex(httrackp* opt,cache_back* cache,char* adr,char* fil,char* sa #if HTS_DIRECTDISK // Court-circuit: // Peut-on stocker le fichier directement sur disque? - if (!readonly && r.statuscode==200 && !is_hypertext_mime(r.contenttype) && strnotempty(save)) { // pas HTML, écrire sur disk directement + if (!readonly && r.statuscode==200 && !is_hypertext_mime(r.contenttype, fil) && strnotempty(save)) { // pas HTML, écrire sur disk directement int ok=0; r.is_write=1; // écrire @@ -457,7 +958,7 @@ htsblk cache_readex(httrackp* opt,cache_back* cache,char* adr,char* fil,char* sa printf("direct-disk: %s\n",save); #endif if (r.out!=NULL) { - char buff[32768+4]; + char BIGSTK buff[32768+4]; LLint size = r.size; if (size > 0) { INTsys nl; @@ -572,7 +1073,7 @@ htsblk cache_readex(httrackp* opt,cache_back* cache,char* adr,char* fil,char* sa /* 0 if failed */ int cache_writedata(FILE* cache_ndx,FILE* cache_dat,char* str1,char* str2,char* outbuff,int len) { if (cache_dat) { - char buff[HTS_URLMAXSIZE*4]; + char BIGSTK buff[HTS_URLMAXSIZE*4]; char s[256]; int pos; fflush(cache_dat); fflush(cache_ndx); @@ -599,7 +1100,7 @@ int cache_writedata(FILE* cache_ndx,FILE* cache_dat,char* str1,char* str2,char* int cache_readdata(cache_back* cache,char* str1,char* str2,char** inbuff,int* inlen) { #if HTS_FAST_CACHE if (cache->hashtable) { - char buff[HTS_URLMAXSIZE*4]; + char BIGSTK buff[HTS_URLMAXSIZE*4]; long int pos; strcpybuff(buff,str1); strcatbuff(buff,str2); if (inthash_read((inthash)cache->hashtable,buff,(long int*)&pos)) { @@ -651,7 +1152,29 @@ void cache_init(cache_back* cache,httrackp* opt) { #else mkdir(fconcat(opt->path_log,"hts-cache"),HTS_PROTECT_FOLDER); #endif - if ((fexist(fconcat(opt->path_log,"hts-cache/new.dat"))) && (fexist(fconcat(opt->path_log,"hts-cache/new.ndx")))) { // il existe déja un cache précédent.. renommer + if ((fexist(fconcat(opt->path_log,"hts-cache/new.zip")))) { // il existe déja un cache précédent.. renommer + /* Previous cache from the previous cache version */ +#if 0 + /* No.. reuse with old httrack releases! */ + if (fexist(fconcat(opt->path_log,"hts-cache/old.dat"))) + remove(fconcat(opt->path_log,"hts-cache/old.dat")); + if (fexist(fconcat(opt->path_log,"hts-cache/old.ndx"))) + remove(fconcat(opt->path_log,"hts-cache/old.ndx")); +#endif + /* Previous cache version */ + if ((fexist(fconcat(opt->path_log,"hts-cache/new.dat"))) && (fexist(fconcat(opt->path_log,"hts-cache/new.ndx")))) { // il existe déja un cache précédent.. renommer + rename(fconcat(opt->path_log,"hts-cache/new.dat"),fconcat(opt->path_log,"hts-cache/old.dat")); + rename(fconcat(opt->path_log,"hts-cache/new.ndx"),fconcat(opt->path_log,"hts-cache/old.ndx")); + } + + /* Remove OLD cache */ + if (fexist(fconcat(opt->path_log,"hts-cache/old.zip"))) + remove(fconcat(opt->path_log,"hts-cache/old.zip")); + + /* Rename */ + rename(fconcat(opt->path_log,"hts-cache/new.zip"),fconcat(opt->path_log,"hts-cache/old.zip")); + } + else if ((fexist(fconcat(opt->path_log,"hts-cache/new.dat"))) && (fexist(fconcat(opt->path_log,"hts-cache/new.ndx")))) { // il existe déja un cache précédent.. renommer #if DEBUGCA printf("work with former cache\n"); #endif @@ -675,6 +1198,136 @@ void cache_init(cache_back* cache,httrackp* opt) { // charger index cache précédent if ( + ( + !cache->ro && + fsize(fconcat(opt->path_log,"hts-cache/old.zip")) > 0 + ) + || + ( + cache->ro && + fsize(fconcat(opt->path_log,"hts-cache/new.zip")) > 0 + ) + ) + { + if (!cache->ro) { + cache->zipInput = unzOpen(fconcat(opt->path_log,"hts-cache/old.zip")); + } else { + cache->zipInput = unzOpen(fconcat(opt->path_log,"hts-cache/new.zip")); + } + + // Corrupted ZIP file ? Try to repair! + if (cache->zipInput == NULL && !cache->ro) { + char* name; + uLong repaired = 0; + uLong repairedBytes = 0; + if (!cache->ro) { + name = fconcat(opt->path_log,"hts-cache/old.zip"); + } else { + name = fconcat(opt->path_log,"hts-cache/new.zip"); + } + if (opt->log) { + fspc(opt->log,"warning"); fprintf(opt->log,"Cache: damaged cache, trying to repair"LF); + fflush(opt->log); + } + if (unzRepair(name, + fconcat(opt->path_log,"hts-cache/repair.zip"), + fconcat(opt->path_log,"hts-cache/repair.tmp"), + &repaired, &repairedBytes + ) == Z_OK) { + unlink(name); + rename(fconcat(opt->path_log,"hts-cache/repair.zip"), name); + cache->zipInput = unzOpen(name); + if (opt->log) { + fspc(opt->log,"warning"); fprintf(opt->log,"Cache: %d bytes successfully recovered in %d entries"LF, + (int) repairedBytes, (int) repaired); + fflush(opt->log); + } + } else { + if (opt->log) { + fspc(opt->log,"warning"); fprintf(opt->log,"Cache: could not repair the cache"LF); + fflush(opt->log); + } + } + } + + // Opened ? + if (cache->zipInput!=NULL) { + + /* Ready directory entries */ + if (unzGoToFirstFile((unzFile) cache->zipInput) == Z_OK) { + char comment[128]; + char BIGSTK filename[HTS_URLMAXSIZE * 4]; + int entries = 0; + memset(comment, 0, sizeof(comment)); // for truncated reads + do { + int readSizeHeader = 0; + filename[0] = '\0'; + comment[0] = '\0'; + if (unzOpenCurrentFile((unzFile) cache->zipInput) == Z_OK) { + if ( + (readSizeHeader = unzGetLocalExtrafield((unzFile) cache->zipInput, comment, sizeof(comment) - 2)) > 0 + && + unzGetCurrentFileInfo((unzFile) cache->zipInput, NULL, filename, sizeof(filename) - 2, NULL, 0, NULL, 0) == Z_OK + ) + { + long int pos = (long int) unzGetOffset((unzFile) cache->zipInput); + assertf(readSizeHeader < sizeof(comment)); + comment[readSizeHeader] = '\0'; + entries++; + if (pos > 0) { + int dataincache = 0; // data in cache ? + char* filenameIndex = filename; + if (strfield(filenameIndex, "http://")) { + filenameIndex += 7; + } + if (comment[0] != '\0') { + int maxLine = 2; + char* a = comment; + while(*a && maxLine-- > 0) { // parse only few first lines + char BIGSTK line[1024]; + line[0] = '\0'; + a+=binput(a, line, sizeof(line) - 2); + if (strfield(line, "X-In-Cache:")) { + if (strfield2(line, "X-In-Cache: 1")) { + dataincache = 1; + } else { + dataincache = 0; + } + break; + } + } + } + if (dataincache) + inthash_add((inthash)cache->hashtable, filenameIndex, pos); + else + inthash_add((inthash)cache->hashtable, filenameIndex, -pos); + } else { + if (opt->log!=NULL) { + fspc(opt->log,"warning"); fprintf(opt->log,"Corrupted cache meta entry #%d"LF, (int)entries); + } + } + } else { + if (opt->log!=NULL) { + fspc(opt->log,"warning"); fprintf(opt->log,"Corrupted cache entry #%d"LF, (int)entries); + } + } + unzCloseCurrentFile((unzFile) cache->zipInput); + } else { + if (opt->log!=NULL) { + fspc(opt->log,"warning"); fprintf(opt->log,"Corrupted cache entry #%d"LF, (int)entries); + } + } + } while( unzGoToNextFile((unzFile) cache->zipInput) == Z_OK ); + if ((opt->debug>0) && (opt->log!=NULL)) { + fspc(opt->log,"debug"); fprintf(opt->log,"Cache index loaded: %d entries loaded"LF, (int)entries); + } + opt->is_update=1; // signaler comme update + + } + + } + + } else if ( ( !cache->ro && fsize(fconcat(opt->path_log,"hts-cache/old.dat")) >=0 && fsize(fconcat(opt->path_log,"hts-cache/old.ndx")) >0 @@ -724,7 +1377,7 @@ void cache_init(cache_back* cache,httrackp* opt) { if (strncmp(firstline,"CACHE-",6)==0) { // Nouvelle version du cache if (strncmp(firstline,"CACHE-1.",8)==0) { // Version 1.1x cache->version=(int)(firstline[8]-'0'); // cache 1.x - if (cache->version <= 4) { + if (cache->version <= 5) { a+=cache_brstr(a,firstline); strcpybuff(cache->lastmodified,firstline); } else { @@ -762,7 +1415,7 @@ void cache_init(cache_back* cache,httrackp* opt) { /* Create hash table for the cache (MUCH FASTER!) */ #if HTS_FAST_CACHE if (cache->use) { - char line[HTS_URLMAXSIZE*2]; + char BIGSTK line[HTS_URLMAXSIZE*2]; char linepos[256]; int pos; while ( (a!=NULL) && (a < (cache->use+buffl) ) ) { @@ -793,60 +1446,96 @@ void cache_init(cache_back* cache,httrackp* opt) { if (!cache->ro) { // ouvrir caches actuels structcheck(fconcat(opt->path_log, "hts-cache/")); - cache->dat=fopen(fconcat(opt->path_log,"hts-cache/new.dat"),"wb"); - cache->ndx=fopen(fconcat(opt->path_log,"hts-cache/new.ndx"),"wb"); - // les deux doivent être ouvrables - if ((cache->dat==NULL) && (cache->ndx!=NULL)) { - fclose(cache->ndx); - cache->ndx=NULL; - } - if ((cache->dat!=NULL) && (cache->ndx==NULL)) { - fclose(cache->dat); - cache->dat=NULL; - } - if (cache->ndx!=NULL) { - char s[256]; - - cache_wstr(cache->dat,"CACHE-1.4"); - fflush(cache->dat); - cache_wstr(cache->ndx,"CACHE-1.4"); - fflush(cache->ndx); - // - time_gmt_rfc822(s); // date et heure actuelle GMT pour If-Modified-Since.. - cache_wstr(cache->ndx,s); - fflush(cache->ndx); // un petit fflush au cas où - - // supprimer old.lst - if (fexist(fconcat(opt->path_log,"hts-cache/old.lst"))) - remove(fconcat(opt->path_log,"hts-cache/old.lst")); - // renommer - if (fexist(fconcat(opt->path_log,"hts-cache/new.lst"))) - rename(fconcat(opt->path_log,"hts-cache/new.lst"),fconcat(opt->path_log,"hts-cache/old.lst")); - // ouvrir - cache->lst=fopen(fconcat(opt->path_log,"hts-cache/new.lst"),"wb"); - { - filecreate_params tmp; - strcpybuff(tmp.path,opt->path_html); // chemin - tmp.lst=cache->lst; // fichier lst - filenote("",&tmp); // initialiser filecreate + if (1) { + /* Create ZIP file cache */ + cache->zipOutput = (void*) zipOpen(fconcat(opt->path_log,"hts-cache/new.zip"), 0); + + if (cache->zipOutput != NULL) { + // supprimer old.lst + if (fexist(fconcat(opt->path_log,"hts-cache/old.lst"))) + remove(fconcat(opt->path_log,"hts-cache/old.lst")); + // renommer + if (fexist(fconcat(opt->path_log,"hts-cache/new.lst"))) + rename(fconcat(opt->path_log,"hts-cache/new.lst"),fconcat(opt->path_log,"hts-cache/old.lst")); + // ouvrir + cache->lst=fopen(fconcat(opt->path_log,"hts-cache/new.lst"),"wb"); + { + filecreate_params tmp; + strcpybuff(tmp.path,opt->path_html); // chemin + tmp.lst=cache->lst; // fichier lst + filenote("",&tmp); // initialiser filecreate + } + + // supprimer old.txt + if (fexist(fconcat(opt->path_log,"hts-cache/old.txt"))) + remove(fconcat(opt->path_log,"hts-cache/old.txt")); + // renommer + if (fexist(fconcat(opt->path_log,"hts-cache/new.txt"))) + rename(fconcat(opt->path_log,"hts-cache/new.txt"),fconcat(opt->path_log,"hts-cache/old.txt")); + // ouvrir + cache->txt=fopen(fconcat(opt->path_log,"hts-cache/new.txt"),"wb"); + if (cache->txt) { + fprintf(cache->txt,"date\tsize'/'remotesize\tflags(request:Update,Range state:File response:Modified,Chunked,gZipped)\t"); + fprintf(cache->txt,"statuscode\tstatus ('servermsg')\tMIME\tEtag|Date\tURL\tlocalfile\t(from URL)"LF); + } } - - // supprimer old.txt - if (fexist(fconcat(opt->path_log,"hts-cache/old.txt"))) - remove(fconcat(opt->path_log,"hts-cache/old.txt")); - // renommer - if (fexist(fconcat(opt->path_log,"hts-cache/new.txt"))) - rename(fconcat(opt->path_log,"hts-cache/new.txt"),fconcat(opt->path_log,"hts-cache/old.txt")); - // ouvrir - cache->txt=fopen(fconcat(opt->path_log,"hts-cache/new.txt"),"wb"); - if (cache->txt) { - fprintf(cache->txt,"date\tsize'/'remotesize\tflags(request:Update,Range state:File response:Modified,Chunked,gZipped)\t"); - fprintf(cache->txt,"statuscode\tstatus ('servermsg')\tMIME\tEtag|Date\tURL\tlocalfile\t(from URL)"LF); + } else { + cache->dat=fopen(fconcat(opt->path_log,"hts-cache/new.dat"),"wb"); + cache->ndx=fopen(fconcat(opt->path_log,"hts-cache/new.ndx"),"wb"); + // les deux doivent être ouvrables + if ((cache->dat==NULL) && (cache->ndx!=NULL)) { + fclose(cache->ndx); + cache->ndx=NULL; + } + if ((cache->dat!=NULL) && (cache->ndx==NULL)) { + fclose(cache->dat); + cache->dat=NULL; } - // test - // cache_writedata(cache->ndx,cache->dat,"//[TEST]//","test1","TEST PIPO",9); + if (cache->ndx!=NULL) { + char s[256]; + + cache_wstr(cache->dat,"CACHE-1.5"); + fflush(cache->dat); + cache_wstr(cache->ndx,"CACHE-1.5"); + fflush(cache->ndx); + // + time_gmt_rfc822(s); // date et heure actuelle GMT pour If-Modified-Since.. + cache_wstr(cache->ndx,s); + fflush(cache->ndx); // un petit fflush au cas où + + // supprimer old.lst + if (fexist(fconcat(opt->path_log,"hts-cache/old.lst"))) + remove(fconcat(opt->path_log,"hts-cache/old.lst")); + // renommer + if (fexist(fconcat(opt->path_log,"hts-cache/new.lst"))) + rename(fconcat(opt->path_log,"hts-cache/new.lst"),fconcat(opt->path_log,"hts-cache/old.lst")); + // ouvrir + cache->lst=fopen(fconcat(opt->path_log,"hts-cache/new.lst"),"wb"); + { + filecreate_params tmp; + strcpybuff(tmp.path,opt->path_html); // chemin + tmp.lst=cache->lst; // fichier lst + filenote("",&tmp); // initialiser filecreate + } + + // supprimer old.txt + if (fexist(fconcat(opt->path_log,"hts-cache/old.txt"))) + remove(fconcat(opt->path_log,"hts-cache/old.txt")); + // renommer + if (fexist(fconcat(opt->path_log,"hts-cache/new.txt"))) + rename(fconcat(opt->path_log,"hts-cache/new.txt"),fconcat(opt->path_log,"hts-cache/old.txt")); + // ouvrir + cache->txt=fopen(fconcat(opt->path_log,"hts-cache/new.txt"),"wb"); + if (cache->txt) { + fprintf(cache->txt,"date\tsize'/'remotesize\tflags(request:Update,Range state:File response:Modified,Chunked,gZipped)\t"); + fprintf(cache->txt,"statuscode\tstatus ('servermsg')\tMIME\tEtag|Date\tURL\tlocalfile\t(from URL)"LF); + } + + // test + // cache_writedata(cache->ndx,cache->dat,"//[TEST]//","test1","TEST PIPO",9); + } } } else { @@ -906,12 +1595,11 @@ char* readfile_or(char* fil,char* defaultdata) { int cache_wstr(FILE* fp,char* s) { INTsys i; char buff[256+4]; - i=strlen(s); + i = s != NULL ? strlen(s) : 0; sprintf(buff,INTsysP "\n",i); if (fwrite(buff,1,(INTsys)strlen(buff),fp) != strlen(buff)) return -1; - if (i>0) - if ((INTsys)fwrite(s,1,i,fp) != i) + if (i > 0 && (INTsys)fwrite(s,1,i,fp) != i) return -1; return 0; } @@ -922,10 +1610,34 @@ void cache_rstr(FILE* fp,char* s) { sscanf(buff,INTsysP,&i); if (i < 0 || i > 32768) /* error, something nasty happened */ i=0; - if (i>0) - fread(s,1,i,fp); + if (i>0) { + if ((int) fread(s,1,i,fp) != i) { + int fread_cache_failed = 0; + assertf(fread_cache_failed); + } + } *(s+i)='\0'; } +char* cache_rstr_addr(FILE* fp) { + INTsys i; + char* addr = NULL; + char buff[256+4]; + linput(fp,buff,256); + sscanf(buff,INTsysP,&i); + if (i < 0 || i > 32768) /* error, something nasty happened */ + i=0; + if (i > 0) { + addr = malloct(i + 1); + if (addr != NULL) { + if ((int) fread(addr,1,i,fp) != i) { + int fread_cache_failed = 0; + assertf(fread_cache_failed); + } + *(addr+i)='\0'; + } + } + return addr; +} int cache_brstr(char* adr,char* s) { int i; int off; diff --git a/src/htscache.h b/src/htscache.h index ef897f1..51dd439 100644 --- a/src/htscache.h +++ b/src/htscache.h @@ -42,9 +42,12 @@ Please visit our Website: http://www.httrack.com #include "htscore.h" +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE + // cache void cache_mayadd(httrackp* opt,cache_back* cache,htsblk* r,char* url_adr,char* url_fil,char* url_save); -void cache_add(htsblk r,char* url_adr,char* url_fil,char* url_save,FILE* cache_ndx,FILE* cache_dat,int all_in_cache); +void cache_add(cache_back* cache,htsblk r,char* url_adr,char* url_fil,char* url_save,int all_in_cache); htsblk cache_read(httrackp* opt,cache_back* cache,char* adr,char* fil,char* save,char* location); htsblk cache_read_ro(httrackp* opt,cache_back* cache,char* adr,char* fil,char* save,char* location); htsblk cache_readex(httrackp* opt,cache_back* cache,char* adr,char* fil,char* save,char* location,char* return_save,int readonly); @@ -56,6 +59,7 @@ int cache_readdata(cache_back* cache,char* str1,char* str2,char** inbuff,int* le int cache_wstr(FILE* fp,char* s); void cache_rstr(FILE* fp,char* s); +char* cache_rstr_addr(FILE* fp); int cache_brstr(char* adr,char* s); int cache_quickbrstr(char* adr,char* s); int cache_brint(char* adr,int* i); @@ -63,4 +67,7 @@ void cache_rint(FILE* fp,int* i); int cache_wint(FILE* fp,int i); void cache_rLLint(FILE* fp,LLint* i); int cache_wLLint(FILE* fp,LLint i); + +#endif + #endif diff --git a/src/htscatchurl.c b/src/htscatchurl.c index 8455ea0..3832019 100644 --- a/src/htscatchurl.c +++ b/src/htscatchurl.c @@ -34,6 +34,9 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + // Fichier intercepteur d'URL .c /* specific definitions */ @@ -41,11 +44,9 @@ Please visit our Website: http://www.httrack.com #include "htsbase.h" #include "htsnet.h" #include "htslib.h" -#include -#include -#include -#include +#ifndef _WIN32_WCE #include +#endif #if HTS_WIN #else #include @@ -194,8 +195,8 @@ HTSEXT_API int catch_url(T_SOC soc,char* url,char* method,char* data) { socinput(soc,line,1000); if (strnotempty(line)) { if (sscanf(line,"%s %s %s",method,url,protocol) == 3) { - char url_adr[HTS_URLMAXSIZE*2]; - char url_fil[HTS_URLMAXSIZE*2]; + char BIGSTK url_adr[HTS_URLMAXSIZE*2]; + char BIGSTK url_fil[HTS_URLMAXSIZE*2]; // méthode en majuscule int i,r=0; url_adr[0]=url_fil[0]='\0'; @@ -207,7 +208,7 @@ HTSEXT_API int catch_url(T_SOC soc,char* url,char* method,char* data) { // adresse du lien if (ident_url_absolute(url,url_adr,url_fil)>=0) { // Traitement des en-têtes - char loc[HTS_URLMAXSIZE*2]; + char BIGSTK loc[HTS_URLMAXSIZE*2]; htsblk blkretour; memset(&blkretour, 0, sizeof(htsblk)); // effacer blkretour.location=loc; // si non nul, contiendra l'adresse véritable en cas de moved xx diff --git a/src/htscatchurl.h b/src/htscatchurl.h index a2514ef..cec7537 100644 --- a/src/htscatchurl.h +++ b/src/htscatchurl.h @@ -41,6 +41,9 @@ Please visit our Website: http://www.httrack.com #include "htsbasenet.h" +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE + // Fonctions void socinput(T_SOC soc,char* s,int max); #ifndef HTTRACK_DEFLIB @@ -74,5 +77,7 @@ HTSEXT_API int catch_url(T_SOC soc,char* url,char* method,char* data); #endif +#endif + diff --git a/src/htscore.c b/src/htscore.c index ba1e226..ff761ef 100644 --- a/src/htscore.c +++ b/src/htscore.c @@ -34,11 +34,12 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ -#include -#include -#include -#include +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + +#ifndef _WIN32_WCE #include +#endif #include /* File defs */ @@ -60,6 +61,10 @@ Please visit our Website: http://www.httrack.com // parser #include "htsparse.h" +/* Cache */ +#include "htszlib.h" + + /* END specific definitions */ @@ -71,6 +76,8 @@ t_hts_htmlcheck_uninit hts_htmlcheck_uninit = NULL; t_hts_htmlcheck_start hts_htmlcheck_start = NULL; t_hts_htmlcheck_end hts_htmlcheck_end = NULL; t_hts_htmlcheck_chopt hts_htmlcheck_chopt = NULL; +t_hts_htmlcheck_process hts_htmlcheck_preprocess = NULL; +t_hts_htmlcheck_process hts_htmlcheck_postprocess = NULL; t_hts_htmlcheck hts_htmlcheck = NULL; t_hts_htmlcheck_query hts_htmlcheck_query = NULL; t_hts_htmlcheck_query2 hts_htmlcheck_query2 = NULL; @@ -80,11 +87,13 @@ t_hts_htmlcheck_check hts_htmlcheck_check = NULL; t_hts_htmlcheck_pause hts_htmlcheck_pause = NULL; t_hts_htmlcheck_filesave hts_htmlcheck_filesave = NULL; t_hts_htmlcheck_linkdetected hts_htmlcheck_linkdetected = NULL; +t_hts_htmlcheck_linkdetected2 hts_htmlcheck_linkdetected2 = NULL; t_hts_htmlcheck_xfrstatus hts_htmlcheck_xfrstatus = NULL; t_hts_htmlcheck_savename hts_htmlcheck_savename = NULL; t_hts_htmlcheck_sendhead hts_htmlcheck_sendhead = NULL; t_hts_htmlcheck_receivehead hts_htmlcheck_receivehead = NULL; +extern void set_wrappers(void); char _hts_errmsg[1100]=""; int _hts_in_html_parsing=0; @@ -201,7 +210,7 @@ hts_htmlcheck_end(); \ if (back) { \ int i; \ for(i=0;i"CRLF,makeindex_firstlink); \ } else \ @@ -313,13 +332,13 @@ makeindex_done=1; /* ok c'est fait */ \ // Début de httpmirror, robot // url1 peut être multiple int httpmirror(char* url1,httrackp* ptropt) { - httrackp opt = *ptropt; // structure d'options + httrackp BIGSTK opt; // structure d'options char* primary=NULL; // première page, contenant les liens à scanner int lien_tot=0; // nombre de liens pour le moment lien_url** liens=NULL; // les pointeurs sur les liens hash_struct hash; // système de hachage, accélère la recherche dans les liens hash_struct* hashptr = &hash; - t_cookie cookie; // gestion des cookies + t_cookie BIGSTK cookie; // gestion des cookies int lien_max=0; int lien_size=0; // octets restants dans buffer liens dispo char* lien_buffer=NULL; // buffer liens actuel @@ -330,7 +349,7 @@ int httpmirror(char* url1,httrackp* ptropt) { int numero_passe=0; // deux passes pour html puis images int back_max=0; // fichiers qui peuvent être en local lien_back* back=NULL; // backing en local - htsblk r; // retour de certaines fonctions + htsblk BIGSTK r; // retour de certaines fonctions TStamp lastime=0; // pour affichage infos de tmp en tmp // pour les stats, nombre de fichiers & octets écrits LLint stat_fragment=0; // pour la fragmentation @@ -346,7 +365,7 @@ int httpmirror(char* url1,httrackp* ptropt) { int makeindex_done=0; // lorsque l'index sera fait FILE* makeindex_fp=NULL; int makeindex_links=0; - char makeindex_firstlink[HTS_URLMAXSIZE*2]; + char BIGSTK makeindex_firstlink[HTS_URLMAXSIZE*2]; // statistiques (mode #Z) FILE* makestat_fp=NULL; // fichier de stats taux transfert FILE* maketrack_fp=NULL; // idem pour le tracking @@ -354,16 +373,19 @@ int httpmirror(char* url1,httrackp* ptropt) { LLint makestat_total=0; // repère du nombre d'octets transférés depuis denrière stat int makestat_lnk=0; // idem, pour le nombre de liens // - char codebase[HTS_URLMAXSIZE*2]; // base pour applet java - char base[HTS_URLMAXSIZE*2]; // base pour les autres fichiers + char BIGSTK codebase[HTS_URLMAXSIZE*2]; // base pour applet java + char BIGSTK base[HTS_URLMAXSIZE*2]; // base pour les autres fichiers // - cache_back cache; - robots_wizard robots; // gestion robots.txt + cache_back BIGSTK cache; + robots_wizard BIGSTK robots; // gestion robots.txt inthash cache_hashtable=NULL; + inthash cache_tests=NULL; int cache_hash_size=0; // char *template_header=NULL,*template_body=NULL,*template_footer=NULL; // + opt = *ptropt; + // codebase[0]='\0'; base[0]='\0'; // cookie.auth.next=NULL; @@ -444,13 +466,16 @@ int httpmirror(char* url1,httrackp* ptropt) { if (!cache_hash_size) cache_hash_size=HTS_HASH_SIZE; cache_hashtable=inthash_new(cache_hash_size); - if (cache_hashtable==NULL) { + cache_tests=inthash_new(cache_hash_size); + if (cache_hashtable==NULL || cache_tests==NULL) { printf("PANIC! : Not enough memory [%d]\n",__LINE__); filters[0]=NULL; back_max=0; // uniquement a cause du warning de XH_extuninit XH_extuninit; return 0; } + inthash_value_is_malloc(cache_tests, 1); /* malloc */ cache.hashtable=(void*)cache_hashtable; /* copy backcache hash */ + cache.cached_tests=(void*)cache_tests; /* copy of cache_tests */ // initialiser cache DNS _hts_lockdns(-999); @@ -539,7 +564,7 @@ int httpmirror(char* url1,httrackp* ptropt) { if (joker) { // joker ou filters //char* p; - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; int type; int plus=0; // noter joker (dans b) @@ -598,7 +623,7 @@ int httpmirror(char* url1,httrackp* ptropt) { } } else { // adresse normale - char url[HTS_URLMAXSIZE*2]; + char BIGSTK url[HTS_URLMAXSIZE*2]; // prochaine adresse i=0; while((*a!=0) && (!isspace((unsigned char)*a))) { url[i++]=*a; a++; } @@ -638,7 +663,7 @@ int httpmirror(char* url1,httrackp* ptropt) { if (filelist_buff) { int filelist_ptr=0; int n=0; - char line[HTS_URLMAXSIZE*2]; + char BIGSTK line[HTS_URLMAXSIZE*2]; char* primary_ptr = primary + strlen(primary); while( filelist_ptr < filelist_sz ) { int count=binput(filelist_buff+filelist_ptr,line,HTS_URLMAXSIZE); @@ -758,6 +783,7 @@ int httpmirror(char* url1,httrackp* ptropt) { makestat_fp=fopen(fconcat(opt.path_log,"hts-stats.txt"),"wb"); if (makestat_fp != NULL) { fprintf(makestat_fp,"HTTrack statistics report, every minutes"LF LF); + fflush(makestat_fp); } } @@ -766,6 +792,7 @@ int httpmirror(char* url1,httrackp* ptropt) { maketrack_fp=fopen(fconcat(opt.path_log,"hts-track.txt"),"wb"); if (maketrack_fp != NULL) { fprintf(maketrack_fp,"HTTrack tracking report, every minutes"LF LF); + fflush(maketrack_fp); } } @@ -776,6 +803,10 @@ int httpmirror(char* url1,httrackp* ptropt) { } } + /* Send options to callback functions */ +#if HTS_ANALYSTE + hts_htmlcheck_chopt(&opt); +#endif // attendre une certaine heure.. if (opt.waittime>0) { @@ -795,6 +826,7 @@ int httpmirror(char* url1,httrackp* ptropt) { } // attendre.. + _hts_in_html_parsing=5; do { TStamp tl=0; time_t tt; @@ -828,6 +860,7 @@ int httpmirror(char* url1,httrackp* ptropt) { } #endif } while(!ok); + _hts_in_html_parsing=0; // note: recopie de plus haut // noter heure actuelle de départ en secondes @@ -854,6 +887,7 @@ int httpmirror(char* url1,httrackp* ptropt) { XH_extuninit; return 1; } + set_wrappers(); // _start() is allowed to set other wrappers #endif @@ -865,7 +899,7 @@ int httpmirror(char* url1,httrackp* ptropt) { do { int error=0; // si error alors sauter int store_errpage=0; // c'est une erreur mais on enregistre le html - char loc[HTS_URLMAXSIZE*2]; // adresse de relocation + char BIGSTK loc[HTS_URLMAXSIZE*2]; // adresse de relocation // Ici on charge le fichier (html, gif..) en mémoire // Les HTMLs sont traités (si leur priorité est suffisante) @@ -877,6 +911,9 @@ int httpmirror(char* url1,httrackp* ptropt) { memcpy(&(r.req.proxy), &opt.proxy, sizeof(opt.proxy)); // et user-agent strcpybuff(r.req.user_agent,opt.user_agent); + strcpybuff(r.req.referer,opt.referer); + strcpybuff(r.req.from,opt.from); + strcpybuff(r.req.lang_iso,opt.lang_iso); r.req.user_agent_send=opt.user_agent_send; if (!error) { @@ -928,9 +965,9 @@ int httpmirror(char* url1,httrackp* ptropt) { Get the next link, waiting for other files, handling external callbacks */ { - char buff_err_msg[1024]; - htsmoduleStruct str; - htsmoduleStructExtended stre; + char BIGSTK buff_err_msg[1024]; + htsmoduleStruct BIGSTK str; + htsmoduleStructExtended BIGSTK stre; buff_err_msg[0] = '\0'; memset(&str, 0, sizeof(str)); memset(&stre, 0, sizeof(stre)); @@ -1018,7 +1055,7 @@ int httpmirror(char* url1,httrackp* ptropt) { } else { // lien vide.. - if (opt.errlog) { + if (opt.errlog && opt.debug > 0) { fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Warning, link #%d empty"LF,ptr); test_flush; } error=1; @@ -1067,8 +1104,8 @@ int httpmirror(char* url1,httrackp* ptropt) { if (!error) { if (r.statuscode == 200) { // OK (ou 304 en backing) if (r.adr) { // Written file - if ( (is_hypertext_mime(r.contenttype)) /* Is HTML or Js, .. */ - || (may_be_hypertext_mime(r.contenttype) && (r.adr) ) /* Is real media, .. */ + if ( (is_hypertext_mime(r.contenttype, urlfil)) /* Is HTML or Js, .. */ + || (may_be_hypertext_mime(r.contenttype, urlfil) && (r.adr) ) /* Is real media, .. */ ) { if (strnotempty(r.cdispo)) { // Content-disposition set! if (ishtml(savename) == 0) { // Non HTML!! @@ -1083,8 +1120,8 @@ int httpmirror(char* url1,httrackp* ptropt) { // ------------------------------------ // BOGUS MIME TYPE HACK II (the revenge) // Check if we have a bogus MIME type - if ( (is_hypertext_mime(r.contenttype)) /* Is HTML or Js, .. */ - || (may_be_hypertext_mime(r.contenttype)) /* Is real media, .. */ + if ( (is_hypertext_mime(r.contenttype, urlfil)) /* Is HTML or Js, .. */ + || (may_be_hypertext_mime(r.contenttype, urlfil)) /* Is real media, .. */ ) { if ((r.adr) && (r.size)) { unsigned int map[256]; @@ -1159,11 +1196,11 @@ int httpmirror(char* url1,httrackp* ptropt) { if (!error) { if (r.statuscode == 200) { // OK (ou 304 en backing) if (r.adr==NULL) { // Written file - if (may_be_hypertext_mime(r.contenttype)) { // to parse! + if (may_be_hypertext_mime(r.contenttype, urlfil)) { // to parse! LLint sz; sz=fsize(savename); if (sz>0) { // ok, exists! - if (sz < 1024) { // ok, small file --> to parse! + if (sz < 8192) { // ok, small file --> to parse! FILE* fp=fopen(savename,"rb"); if (fp) { r.adr=malloct((int)sz + 2); @@ -1216,9 +1253,9 @@ int httpmirror(char* url1,httrackp* ptropt) { redirect pages. */ if (!error) { - char buff_err_msg[1024]; - htsmoduleStruct str; - htsmoduleStructExtended stre; + char BIGSTK buff_err_msg[1024]; + htsmoduleStruct BIGSTK str; + htsmoduleStructExtended BIGSTK stre; buff_err_msg[0] = '\0'; memset(&str, 0, sizeof(str)); memset(&stre, 0, sizeof(stre)); @@ -1346,8 +1383,8 @@ int httpmirror(char* url1,httrackp* ptropt) { // traiter if ( - ( (is_hypertext_mime(r.contenttype)) /* Is HTML or Js, .. */ - || (may_be_hypertext_mime(r.contenttype) && (r.adr) ) /* Is real media, .. */ + ( (is_hypertext_mime(r.contenttype, urlfil)) /* Is HTML or Js, .. */ + || (may_be_hypertext_mime(r.contenttype, urlfil) && (r.adr) ) /* Is real media, .. */ ) && (liens[ptr]->depth>0) /* Depth > 0 (recurse depth) */ && (r.adr!=NULL) /* HTML Data exists */ @@ -1363,9 +1400,9 @@ int httpmirror(char* url1,httrackp* ptropt) { fspc(opt.log,"info"); fprintf(opt.log,"engine: check-html: %s%s"LF,urladr,urlfil); } { - char buff_err_msg[1024]; - htsmoduleStruct str; - htsmoduleStructExtended stre; + char BIGSTK buff_err_msg[1024]; + htsmoduleStruct BIGSTK str; + htsmoduleStructExtended BIGSTK stre; buff_err_msg[0] = '\0'; memset(&str, 0, sizeof(str)); memset(&stre, 0, sizeof(stre)); @@ -1507,9 +1544,9 @@ int httpmirror(char* url1,httrackp* ptropt) { if (strcmp(urlfil,"/robots.txt")==0) { // robots.txt if (r.adr) { int bptr=0; - char line[1024]; - char buff[8192]; - char infobuff[8192]; + char BIGSTK line[1024]; + char BIGSTK buff[8192]; + char BIGSTK infobuff[8192]; int record=0; line[0]='\0'; buff[0]='\0'; infobuff[0]='\0'; // @@ -1553,7 +1590,7 @@ int httpmirror(char* url1,httrackp* ptropt) { while(is_realspace(*a)) a++; // sauter espace(s) if (strnotempty(a)) { - if (strcmp(a,"/") != 0) { /* ignoring disallow: / */ + if (strcmp(a,"/") != 0 || opt.robots >= 3) { /* ignoring disallow: / */ if ( (strlen(buff) + strlen(a) + 8) < sizeof(buff)) { strcatbuff(buff,a); strcatbuff(buff,"\n"); @@ -1601,8 +1638,8 @@ int httpmirror(char* url1,httrackp* ptropt) { // Si par la suite on doit retraiter ce fichier avec un niveau de récursion plus // fort, on supprimera le readme, et on scannera le fichier html! // note: sauté si store_errpage (càd si page d'erreur, non à scanner!) - if ( (is_hypertext_mime(r.contenttype)) && (!store_errpage) && (r.size>0)) { // c'est du html!! - char tempo[HTS_URLMAXSIZE*2]; + if ( (is_hypertext_mime(r.contenttype, urlfil)) && (!store_errpage) && (r.size>0)) { // c'est du html!! + char BIGSTK tempo[HTS_URLMAXSIZE*2]; FILE* fp; tempo[0]='\0'; strcpybuff(tempo,savename); @@ -1695,7 +1732,7 @@ int httpmirror(char* url1,httrackp* ptropt) { FILE* fp=fopen(savename,"r+b"); if (fp) { if (!fseek(fp,0,SEEK_SET)) { - char line[HTS_URLMAXSIZE*2]; + char BIGSTK line[HTS_URLMAXSIZE*2]; linput(fp,line,HTS_URLMAXSIZE); if (strnotempty(line)) { if ((opt.debug>1) && (opt.log!=NULL)) { @@ -1711,8 +1748,8 @@ int httpmirror(char* url1,httrackp* ptropt) { /* External modules */ if (opt.parsejava && fexist(savename)) { - char buff_err_msg[1024]; - htsmoduleStruct str; + char BIGSTK buff_err_msg[1024]; + htsmoduleStruct BIGSTK str; buff_err_msg[0] = '\0'; memset(&str, 0, sizeof(str)); /* */ @@ -1889,7 +1926,7 @@ jump_if_done: while(!feof(old_lst)) { linput(old_lst,line,1000); if (!strstr(adr,line)) { // fichier non trouvé dans le nouveau? - char file[HTS_URLMAXSIZE*2]; + char BIGSTK file[HTS_URLMAXSIZE*2]; strcpybuff(file,opt.path_html); strcatbuff(file,line+1); file[strlen(file)-1]='\0'; @@ -1912,7 +1949,7 @@ jump_if_done: line[strlen(line)-1]='\0'; if (strnotempty(line)) if (!strstr(adr,line)) { // non trouvé? - char file[HTS_URLMAXSIZE*2]; + char BIGSTK file[HTS_URLMAXSIZE*2]; strcpybuff(file,opt.path_html); strcatbuff(file,line+1); while ((strnotempty(file)) && (rmdir(file)==0)) { // ok, éliminé (existait) @@ -1956,26 +1993,28 @@ jump_if_done: // afficher résumé dans log if (opt.log!=NULL) { + char BIGSTK finalInfo[8192]; int error = fspc(NULL,"error"); int warning = fspc(NULL,"warning"); int info = fspc(NULL,"info"); - char htstime[256]; - char infoupdated[256]; + char BIGSTK htstime[256]; + char BIGSTK infoupdated[256]; // int n=(int) (stat_loaded/(time_local()-HTS_STAT.stat_timestart)); LLint n=(LLint) (HTS_STAT.HTS_TOTAL_RECV/(max(1,time_local()-HTS_STAT.stat_timestart))); sec2str(htstime,time_local()-HTS_STAT.stat_timestart); - //fprintf(opt.log,LF"HTS-mirror complete in %s : %d links scanned, %d files written (%d bytes overall) [%d bytes received at %d bytes/sec]"LF,htstime,lien_tot-1,HTS_STAT.stat_files,stat_bytes,stat_loaded,n); + //sprintf(finalInfo + strlen(finalInfo),LF"HTS-mirror complete in %s : %d links scanned, %d files written (%d bytes overall) [%d bytes received at %d bytes/sec]"LF,htstime,lien_tot-1,HTS_STAT.stat_files,stat_bytes,stat_loaded,n); infoupdated[0] = '\0'; if (opt.is_update) { - if (HTS_STAT.stat_updated_files < 0) { + if (HTS_STAT.stat_updated_files > 0) { sprintf(infoupdated, ", %d files updated", (int)HTS_STAT.stat_updated_files); } else { sprintf(infoupdated, ", no files updated"); } } - fprintf(opt.log,LF - "HTTrack mirror complete in %s : " + finalInfo[0] = '\0'; + sprintf(finalInfo + strlen(finalInfo), + "HTTrack Website Copier/"HTTRACK_VERSION" mirror complete in %s : " "%d links scanned, %d files written ("LLintP" bytes overall)%s " "["LLintP" bytes received at "LLintP" bytes/sec]", htstime, @@ -1985,20 +2024,31 @@ jump_if_done: infoupdated, (LLint)HTS_STAT.HTS_TOTAL_RECV, (LLint)n - ); + ); + if (HTS_STAT.total_packed > 0 && HTS_STAT.total_unpacked > 0) { int packed_ratio=(int)((LLint)(HTS_STAT.total_packed*100)/HTS_STAT.total_unpacked); - fprintf(opt.log,", "LLintP" bytes transfered using HTTP compression in %d files, ratio %d%%",(LLint)HTS_STAT.total_unpacked,HTS_STAT.total_packedfiles,(int)packed_ratio); + sprintf(finalInfo + strlen(finalInfo),", "LLintP" bytes transfered using HTTP compression in %d files, ratio %d%%",(LLint)HTS_STAT.total_unpacked,HTS_STAT.total_packedfiles,(int)packed_ratio); } if (!opt.nokeepalive && HTS_STAT.stat_sockid > 0 && HTS_STAT.stat_nrequests > HTS_STAT.stat_sockid) { int rq = (HTS_STAT.stat_nrequests * 10) / HTS_STAT.stat_sockid; - fprintf(opt.log,", %d.%d requests per connection", rq/10, rq%10); + sprintf(finalInfo + strlen(finalInfo),", %d.%d requests per connection", rq/10, rq%10); } - fprintf(opt.log,LF); + sprintf(finalInfo + strlen(finalInfo),LF); if (error) - fprintf(opt.log,"(%d errors, %d warnings, %d messages)"LF,error,warning,info); + sprintf(finalInfo + strlen(finalInfo),"(%d errors, %d warnings, %d messages)"LF,error,warning,info); else - fprintf(opt.log,"(No errors, %d warnings, %d messages)"LF,warning,info); + sprintf(finalInfo + strlen(finalInfo),"(No errors, %d warnings, %d messages)"LF,warning,info); + + // Log + fprintf(opt.log,LF"%s", finalInfo); + + // Close ZIP + if (cache.zipOutput) { + zipClose(cache.zipOutput, finalInfo); + cache.zipOutput = NULL; + } + test_flush; } #if DEBUG_HASH @@ -2301,7 +2351,7 @@ int filters_init(char*** ptrfilters, int maxfilter, int filterinc) { HTSEXT_API int structcheck(char* s) { // vérifier la présence des dossier(s) char *a=s; - char nom[HTS_URLMAXSIZE*2]; + char BIGSTK nom[HTS_URLMAXSIZE*2]; char *b; //inthash structcheck_hash=NULL; if (strnotempty(s)==0) return 0; @@ -2399,7 +2449,7 @@ int check_fatal_io_errno(void) { // ouvrir un fichier (avec chemin Un*x) FILE* filecreate(char* s) { - char fname[HTS_URLMAXSIZE*2]; + char BIGSTK fname[HTS_URLMAXSIZE*2]; FILE* fp; fname[0]='\0'; @@ -2464,7 +2514,7 @@ int filenote(char* s,filecreate_params* params) { strc->lst=params->lst; return 0; } else if (strc->lst) { - char savelst[HTS_URLMAXSIZE*2]; + char BIGSTK savelst[HTS_URLMAXSIZE*2]; strcpybuff(savelst,fslash(s)); // couper chemin? if (strnotempty(strc->path)) { @@ -2515,7 +2565,7 @@ HTS_INLINE void usercommand(httrackp* opt,int _exe,char* _cmd,char* file,char* a } } void usercommand_exe(char* cmd,char* file) { - char temp[8192]; + char BIGSTK temp[8192]; char c[2]=""; int i; temp[0]='\0'; @@ -2554,7 +2604,7 @@ static void postprocess_file(httrackp* opt,char* save, char* adr, char* fil) { first = 1; opt->state.mimefp = fopen(fconcat(opt->path_html,"index.mht"), "wb"); if (opt->state.mimefp != NULL) { - char rndtmp[1024], currtime[256]; + char BIGSTK rndtmp[1024], currtime[256]; srand(time(NULL)); time_gmt_rfc822(currtime); sprintf(rndtmp, "%d_%d", (int)time(NULL), (int) rand()); @@ -2583,7 +2633,7 @@ static void postprocess_file(httrackp* opt,char* save, char* adr, char* fil) { if (fp != NULL) { char buff[60*100 + 2]; char mimebuff[256]; - char cid[HTS_URLMAXSIZE*3]; + char BIGSTK cid[HTS_URLMAXSIZE*3]; int len; int isHtml = ( ishtml(save) == 1 ); mimebuff[0] = '\0'; @@ -2730,13 +2780,31 @@ HTS_INLINE int back_fillmax(lien_back* back,int back_max,httrackp* opt,cache_bac return -1; /* plus de place */ } -// remplir backing -int back_fill(lien_back* back,int back_max,httrackp* opt,cache_back* cache,lien_url** liens,int ptr,int numero_passe,int lien_tot) { +int back_pluggable_sockets_strict(lien_back* back, int back_max, httrackp* opt) { + int n = opt->maxsoc - back_nsoc(back, back_max); + + // connect limiter + if (n > 0 && opt->maxconn > 0 && HTS_STAT.last_connect > 0) { + TStamp opTime = HTS_STAT.last_request ? HTS_STAT.last_request : HTS_STAT.last_connect; + TStamp cTime = mtime_local(); + TStamp lap = ( cTime - opTime ); + TStamp minLap = (TStamp) ( 1000.0 / opt->maxconn ); + if (lap < minLap) { + n = 0; + } else { + int nMax = (int) ( lap / minLap ); + n = min(n, nMax); + } + } + + return n; +} + +int back_pluggable_sockets(lien_back* back, int back_max, httrackp* opt) { int n; - int oneLess = ( (_hts_in_html_parsing == 2 && opt->maxsoc >= 2) || (_hts_in_html_parsing == 1 && opt->maxsoc >= 4) ) ? 1 : 0; // testing links // ajouter autant de socket qu'on peut ajouter - n=opt->maxsoc-back_nsoc(back,back_max) - oneLess; + n=back_pluggable_sockets_strict(back, back_max, opt); // vérifier qu'il restera assez de place pour les tests ensuite (en théorie, 1 entrée libre restante suffirait) n=min( n, back_available(back,back_max) - 8 ); @@ -2745,6 +2813,12 @@ int back_fill(lien_back* back,int back_max,httrackp* opt,cache_back* cache,lien_ if (back_stack_available(back,back_max) <= 2) n=0; + return n; +} + +// remplir backing +int back_fill(lien_back* back,int back_max,httrackp* opt,cache_back* cache,lien_url** liens,int ptr,int numero_passe,int lien_tot) { + int n = back_pluggable_sockets(back, back_max, opt); if (n>0) { int p; @@ -2886,11 +2960,7 @@ void sig_ask( int code ) { // demander void sig_ignore( int code ) { // ignorer signal } void sig_brpipe( int code ) { // treat if necessary - /* - if (!sig_ignore_flag(-1)) { - sig_term(code); - } - */ + signal(code, sig_brpipe); } void sig_doback(int blind) { // mettre en backing int out=-1; @@ -2943,7 +3013,11 @@ int read_stdin(char* s,int max) { } #ifdef _WIN32 HTS_INLINE int check_stdin(void) { +#ifndef _WIN32_WCE return (_kbhit()); +#else + return 0; +#endif } #else HTS_INLINE int check_flot(T_SOC s) { @@ -3043,7 +3117,7 @@ char* next_token(char* p,int flag) { else if (*(p+1)=='"') c='"'; if (c) { - char tempo[8192]; + char BIGSTK tempo[8192]; tempo[0]=c; tempo[1]='\0'; strcatbuff(tempo,p+2); strcpybuff(p,tempo); @@ -3051,7 +3125,7 @@ char* next_token(char* p,int flag) { } } else if (*p==34) { // guillemets (de fin) - char tempo[8192]; + char BIGSTK tempo[8192]; tempo[0]='\0'; strcatbuff(tempo,p+1); strcpybuff(p,tempo); /* wipe "" */ @@ -3181,6 +3255,10 @@ HTSEXT_API int hts_is_testing(void) { // 0 non 1 test 2 purge return 2; else if (_hts_in_html_parsing==4) return 3; + else if (_hts_in_html_parsing==5) // scheduling + return 4; + else if (_hts_in_html_parsing==6) // wait for slot + return 5; return 0; } HTSEXT_API int hts_is_exiting(void) { @@ -3254,6 +3332,9 @@ HTSEXT_API int copy_htsopt(httrackp* from,httrackp* to) { if (from->maxrate > -1) to->maxrate = from->maxrate; + if (from->maxconn > 0) + to->maxconn = from->maxconn; + if (strnotempty(from->user_agent)) strcpybuff(to->user_agent , from->user_agent); @@ -3303,10 +3384,10 @@ int htsAddLink(htsmoduleStruct* str, char* link) { char* lien_buffer = * ( (char**) (str->lien_buffer_) ); /* */ /* */ - char adr[HTS_URLMAXSIZE*2], + char BIGSTK adr[HTS_URLMAXSIZE*2], fil[HTS_URLMAXSIZE*2], save[HTS_URLMAXSIZE*2]; - char codebase[HTS_URLMAXSIZE*2]; + char BIGSTK codebase[HTS_URLMAXSIZE*2]; /* */ int pass_fix, prio_fix; /* */ @@ -3321,7 +3402,7 @@ int htsAddLink(htsmoduleStruct* str, char* link) { // #if HTS_ANALYSTE - if (!hts_htmlcheck_linkdetected(link)) { + if (!hts_htmlcheck_linkdetected(link) || !hts_htmlcheck_linkdetected2(link, NULL)) { if (opt->errlog) { fspc(opt->errlog,"error"); fprintf(opt->errlog,"Link %s refused by external wrapper"LF, link); test_flush; @@ -3347,7 +3428,7 @@ int htsAddLink(htsmoduleStruct* str, char* link) { *(a+1)='\0'; // couper } else { // couper http:// éventuel if (strfield(codebase,"http://")) { - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; char* a=codebase+7; a=strchr(a,'/'); // après host if (a) { // ** msg erreur et vérifier? @@ -3382,6 +3463,7 @@ int htsAddLink(htsmoduleStruct* str, char* link) { int just_test_it = 0; forbidden_url = hts_acceptlink(opt, ptr, lien_tot, liens, adr,fil, + NULL, NULL, &set_prio_to, &just_test_it); if ((opt->debug>1) && (opt->log!=NULL)) { @@ -3391,7 +3473,7 @@ int htsAddLink(htsmoduleStruct* str, char* link) { /* Link accepted */ if (!forbidden_url) { - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; int a,b; tempo[0]='\0'; a=opt->savename_type; diff --git a/src/htscore.h b/src/htscore.h index d9e5d0a..97c0127 100644 --- a/src/htscore.h +++ b/src/htscore.h @@ -44,16 +44,25 @@ Please visit our Website: http://www.httrack.com /* specific definitions */ #include "htsbase.h" // Includes & définitions -#include -#include -#include +#ifdef HAVE_SYS_TYPES_H #include +#endif +#ifdef HAVE_SYS_STAT_H #include +#endif #ifdef _WIN32 +#ifndef _WIN32_WCE #include +#endif +#ifndef _WIN32_WCE #include #include #else +#ifndef HTS_CECOMPAT +#include "signal.h" +#endif +#endif +#else #include #ifdef HAVE_UNISTD_H #include @@ -68,7 +77,7 @@ Please visit our Website: http://www.httrack.com #include "htsopt.h" // structure d'un lien -typedef struct { +typedef struct lien_url { char firstblock; // flag 1=premier malloc char link_import; // lien importé à la suite d'un moved - ne pas appliquer les règles classiques up/down int depth; // profondeur autorisée lien ; >0 forte 0=faible @@ -93,7 +102,7 @@ typedef struct { } lien_url; // chargement de fichiers en 'arrière plan' -typedef struct { +typedef struct lien_back { #if DEBUG_CHECKINT char magic; #endif @@ -137,8 +146,10 @@ typedef struct { #endif } lien_back; +typedef struct cache_back_zip_entry cache_back_zip_entry; + // cache -typedef struct { +typedef struct cache_back { int version; // 0 ou 1 /* */ int type; @@ -150,15 +161,23 @@ typedef struct { char lastmodified[256]; // HASH void* hashtable; + // HASH for tests (naming subsystem) + void* cached_tests; // fichiers log optionnels FILE* log; FILE* errlog; // variables int ptr_ant; // pointeur pour anticiper int ptr_last; // pointeur pour anticiper + // + void* zipInput; + void* zipOutput; + cache_back_zip_entry* zipEntries; + int zipEntriesOffs; + int zipEntriesCapa; } cache_back; -typedef struct { +typedef struct hash_struct { lien_url** liens; // pointeur sur liens int max_lien; // indice le plus grand rencontré int hash[3][HTS_HASH_SIZE]; // tables pour sav/adr-fil/former_adr-former_fil @@ -169,11 +188,24 @@ typedef struct { #define hash_write(A,B) #endif -typedef struct { +typedef struct filecreate_params { FILE* lst; char path[HTS_URLMAXSIZE*2]; } filecreate_params; +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE + +static int cache_writable(cache_back* cache) { + return (cache != NULL && ( cache->dat != NULL || cache->zipOutput != NULL ) ); +} + +static int cache_readable(cache_back* cache) { + return (cache != NULL && ( cache->olddat != NULL || cache->zipInput != NULL ) ); +} + +#endif + // Fonctions // INCLUDES .H PARTIES DE CODE HTTRACK @@ -240,6 +272,7 @@ typedef void (* t_hts_htmlcheck_uninit)(void); typedef int (* t_hts_htmlcheck_start)(httrackp* opt); typedef int (* t_hts_htmlcheck_end)(void); typedef int (* t_hts_htmlcheck_chopt)(httrackp* opt); +typedef int (* t_hts_htmlcheck_process)(char** html,int* len,char* url_adresse,char* url_fichier); typedef int (* t_hts_htmlcheck)(char* html,int len,char* url_adresse,char* url_fichier); typedef char* (* t_hts_htmlcheck_query)(char* question); typedef char* (* t_hts_htmlcheck_query2)(char* question); @@ -249,6 +282,7 @@ typedef int (* t_hts_htmlcheck_check)(char* adr,char* fil,int status); typedef void (* t_hts_htmlcheck_pause)(char* lockfile); typedef void (* t_hts_htmlcheck_filesave)(char* file); typedef int (* t_hts_htmlcheck_linkdetected)(char* link); +typedef int (* t_hts_htmlcheck_linkdetected2)(char* link, char* tag_start); typedef int (* t_hts_htmlcheck_xfrstatus)(lien_back* back); typedef int (* t_hts_htmlcheck_savename)(char* adr_complete,char* fil_complete,char* referer_adr,char* referer_fil,char* save); typedef int (* t_hts_htmlcheck_sendhead)(char* buff, char* adr, char* fil, char* referer_adr, char* referer_fil, htsblk* outgoing); @@ -264,6 +298,8 @@ extern t_hts_htmlcheck_uninit hts_htmlcheck_uninit; extern t_hts_htmlcheck_start hts_htmlcheck_start; extern t_hts_htmlcheck_end hts_htmlcheck_end; extern t_hts_htmlcheck_chopt hts_htmlcheck_chopt; +extern t_hts_htmlcheck_process hts_htmlcheck_preprocess; +extern t_hts_htmlcheck_process hts_htmlcheck_postprocess; extern t_hts_htmlcheck hts_htmlcheck; extern t_hts_htmlcheck_query hts_htmlcheck_query; extern t_hts_htmlcheck_query2 hts_htmlcheck_query2; @@ -273,11 +309,16 @@ extern t_hts_htmlcheck_check hts_htmlcheck_check; extern t_hts_htmlcheck_pause hts_htmlcheck_pause; extern t_hts_htmlcheck_filesave hts_htmlcheck_filesave; extern t_hts_htmlcheck_linkdetected hts_htmlcheck_linkdetected; +extern t_hts_htmlcheck_linkdetected2 hts_htmlcheck_linkdetected2; extern t_hts_htmlcheck_xfrstatus hts_htmlcheck_xfrstatus; extern t_hts_htmlcheck_savename hts_htmlcheck_savename; extern t_hts_htmlcheck_sendhead hts_htmlcheck_sendhead; extern t_hts_htmlcheck_receivehead hts_htmlcheck_receivehead; */ + +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE + // #ifndef HTTRACK_DEFLIB HTSEXT_API int hts_is_parsing(int flag); @@ -307,8 +348,6 @@ extern char** _hts_addurl; extern int _hts_cancel; #endif - - // @@ -342,6 +381,8 @@ int liens_record(char* adr,char* fil,char* save,char* former_adr,char* former_fi // backing, routines externes +int back_pluggable_sockets(lien_back* back, int back_max, httrackp* opt); +int back_pluggable_sockets_strict(lien_back* back, int back_max, httrackp* opt); int back_fill(lien_back* back,int back_max,httrackp* opt,cache_back* cache,lien_url** liens,int ptr,int numero_passe,int lien_tot); int backlinks_done(lien_url** liens,int lien_tot,int ptr); int back_fillmax(lien_back* back,int back_max,httrackp* opt,cache_back* cache,lien_url** liens,int ptr,int numero_passe,int lien_tot); @@ -395,4 +436,6 @@ void voidf(void); #endif +#endif + diff --git a/src/htscoremain.c b/src/htscoremain.c index 1162c18..bd90593 100644 --- a/src/htscoremain.c +++ b/src/htscoremain.c @@ -35,6 +35,9 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + #include "htscoremain.h" #include "htsglobal.h" @@ -43,6 +46,7 @@ Please visit our Website: http://www.httrack.com #include "htsalias.h" #include "htswrap.h" #include "htsmodules.h" +#include "htszlib.h" #include #if HTS_WIN @@ -100,7 +104,7 @@ extern int IPV6_resolver; } \ } while(0) -static void set_wrappers(void) { +void set_wrappers(void) { #if HTS_ANALYSTE // custom wrappers hts_htmlcheck_init = (t_hts_htmlcheck_init) htswrap_read("init"); @@ -108,6 +112,8 @@ static void set_wrappers(void) { hts_htmlcheck_start = (t_hts_htmlcheck_start) htswrap_read("start"); hts_htmlcheck_end = (t_hts_htmlcheck_end) htswrap_read("end"); hts_htmlcheck_chopt = (t_hts_htmlcheck_chopt) htswrap_read("change-options"); + hts_htmlcheck_preprocess = (t_hts_htmlcheck_process) htswrap_read("preprocess-html"); + hts_htmlcheck_postprocess = (t_hts_htmlcheck_process) htswrap_read("postprocess-html"); hts_htmlcheck = (t_hts_htmlcheck) htswrap_read("check-html"); hts_htmlcheck_query = (t_hts_htmlcheck_query) htswrap_read("query"); hts_htmlcheck_query2 = (t_hts_htmlcheck_query2) htswrap_read("query2"); @@ -117,6 +123,7 @@ static void set_wrappers(void) { hts_htmlcheck_pause = (t_hts_htmlcheck_pause) htswrap_read("pause"); hts_htmlcheck_filesave = (t_hts_htmlcheck_filesave) htswrap_read("save-file"); hts_htmlcheck_linkdetected = (t_hts_htmlcheck_linkdetected) htswrap_read("link-detected"); + hts_htmlcheck_linkdetected2 = (t_hts_htmlcheck_linkdetected2) htswrap_read("link-detected2"); hts_htmlcheck_xfrstatus = (t_hts_htmlcheck_xfrstatus) htswrap_read("transfer-status"); hts_htmlcheck_savename = (t_hts_htmlcheck_savename) htswrap_read("save-name"); hts_htmlcheck_sendhead = (t_hts_htmlcheck_sendhead) htswrap_read("send-header"); @@ -130,13 +137,9 @@ HTSEXT_API int hts_main(int argc, char **argv) { #else int main(int argc, char **argv) { #endif - char* x_argv[999]; // Patch pour argv et argc: en cas de récupération de ligne de commande + char** x_argv=NULL; // Patch pour argv et argc: en cas de récupération de ligne de commande char* x_argvblk=NULL; // (reprise ou update) int x_ptr=0; // offset - /* - char* x_argv2[999]; // Patch pour config - char* x_argvblk2=NULL; - */ // int argv_url=-1; // ==0 : utiliser cache et doit.log char* argv_firsturl=NULL; // utilisé pour nommage par défaut @@ -144,13 +147,13 @@ int main(int argc, char **argv) { int url_sz = 65535; //char url[65536]; // URLS séparées par un espace // the parametres - httrackp httrack; + httrackp BIGSTK httrack; int httrack_logmode=3; // ONE log file - int recuperer=0; // récupérer un plantage (n'arrive jamais, à supprimer) + int recuperer=0; // récupérer un plantage (n'arrive jamais, à supprimer) #if HTS_WIN #if HTS_ANALYSTE!=2 WORD wVersionRequested; /* requested version WinSock API */ - WSADATA wsadata; /* Windows Sockets API data */ + WSADATA BIGSTK wsadata; /* Windows Sockets API data */ #endif #else #ifndef HTS_DO_NOT_USE_UID @@ -197,6 +200,8 @@ int main(int argc, char **argv) { strcpybuff(httrack.proxy.bindhost, ""); // bind default host httrack.user_agent_send=1; // envoyer un user-agent strcpybuff(httrack.user_agent,"Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)"); + strcpybuff(httrack.referer, ""); + strcpybuff(httrack.from, ""); httrack.savename_83=0; // noms longs par défaut httrack.savename_type=0; // avec structure originale httrack.mimehtml=0; // pas MIME-html @@ -218,6 +223,7 @@ int main(int argc, char **argv) { httrack.nocompression=0; // pas de compression httrack.tolerant=0; // ne pas accepter content-length incorrect httrack.parseall=1; // tout parser (tags inconnus, par exemple) + httrack.parsedebug=0; // pas de mode débuggage httrack.norecatch=0; // ne pas reprendre les fichiers effacés par l'utilisateur httrack.verbosedisplay=0; // pas d'animation texte httrack.sizehack=0; // size hack @@ -238,19 +244,25 @@ int main(int argc, char **argv) { strcpybuff(httrack.path_log,""); strcpybuff(httrack.path_bin,""); // +#if HTS_SPARE_MEMORY==0 httrack.maxlink=100000; // 100,000 liens max par défaut (400Kb) httrack.maxfilter=200; // 200 filtres max par défaut +#else + httrack.maxlink=10000; // 10,000 liens max par défaut (40Kb) + httrack.maxfilter=50; // 50 filtres max par défaut +#endif httrack.maxcache=1048576*32; // a peu près 32Mo en cache max -- OPTION NON PARAMETRABLE POUR L'INSTANT -- //httrack.maxcache_anticipate=256; // maximum de liens à anticiper httrack.maxtime=-1; // temps max en secondes - httrack.maxrate=-1; // pas de taux maxi - httrack.maxconn=10; // nombre connexions/s + httrack.maxrate=25000; // default max rate + httrack.maxconn=5.0; // nombre connexions/s httrack.waittime=-1; // wait until.. hh*3600+mm*60+ss // httrack.exec=argv[0]; httrack.is_update=0; // not an update (yet) httrack.dir_topindex=0; // do not built top index (yet) // + httrack.bypass_limits=0; // enforce limits by default httrack.state.stop=0; // stopper httrack.state.exit_xh=0; // abort // @@ -337,6 +349,15 @@ int main(int argc, char **argv) { strcpybuff(httrack.path_bin, HTS_HTTRACKDIR); #endif + /* libhttrack-plugin DLL preload (libhttrack-plugin.so or libhttrack-plugin.dll) */ + { + void* userfunction = getFunctionPtr(&httrack, "libhttrack-plugin", "plugin_init"); + if (userfunction != NULL) { + t_hts_htmlcheck_init initFnc = (t_hts_htmlcheck_init) userfunction; + initFnc(); + set_wrappers(); /* Re-read wrappers internal static functions */ + } + } /* filter CR, LF, TAB.. */ { @@ -373,13 +394,16 @@ int main(int argc, char **argv) { } x_argvblk[0]='\0'; x_ptr=0; + + /* Create argv */ + x_argv = (char**) malloct(sizeof(char*) * ( argc + 1024 )); } /* Create new argc/argv, replace alias, count URLs, treat -h, -q, -i */ { - char _tmp_argv[2][HTS_CDLMAXSIZE]; + char BIGSTK _tmp_argv[2][HTS_CDLMAXSIZE]; + char BIGSTK tmp_error[HTS_CDLMAXSIZE]; char* tmp_argv[2]; - char tmp_error[HTS_CDLMAXSIZE]; int tmp_argc; int x_argc=0; int na; @@ -461,53 +485,6 @@ int main(int argc, char **argv) { argc=x_argc; } - - - - // Ici on ajoute les arguments de config -/* - if (fexist("config")) { // configuration - x_argvblk2=(char*) calloct(32768,1); - - if (x_argvblk2!=NULL) { - FILE* fp; - int x_argc2; - - //strcpybuff(x_argvblk2,"httrack "); - fp=fopen("config","rb"); - if (fp) { - linput(fp,x_argvblk2+strlen(x_argvblk2),32000); - fclose(fp); fp=NULL; - - // calculer arguments selon derniers arguments - x_argv2[0]=argv[0]; - x_argc2=1; - { - char* p=x_argvblk2; - do { - x_argv2[x_argc2++]=p; - p=strchr(p,' '); - if (p) { - *p=0; // octet nul (tableau) - p++; - } - } while(p!=NULL); - } - // recopier arguments actuels (pointeurs uniquement) - { - int na; - for(na=1;na= 2 && argv0[0]=='\"' && argv0[len-1]=='\"') { // "foo" - char tempo[1024]; - tempo[0] = '\0'; - strncatbuff(tempo, argv0+1, len-2); - strcpybuff(argv0, tempo); - } - */ argc=insert_after_argc+insert_after; insert_after++; } @@ -675,7 +639,11 @@ int main(int argc, char **argv) { #if DEBUG_STEPS printf("Checking cache\n"); #endif - if ( (!fexist(fconcat(httrack.path_log,"hts-cache/new.dat"))) || (!fexist(fconcat(httrack.path_log,"hts-cache/new.ndx"))) ) { + if (!fexist(fconcat(httrack.path_log,"hts-cache/new.zip"))) { + if ( fexist(fconcat(httrack.path_log,"hts-cache/old.zip")) ) { + rename(fconcat(httrack.path_log,"hts-cache/old.zip"),fconcat(httrack.path_log,"hts-cache/new.zip")); + } + } else if ( (!fexist(fconcat(httrack.path_log,"hts-cache/new.dat"))) || (!fexist(fconcat(httrack.path_log,"hts-cache/new.ndx"))) ) { if ( (fexist(fconcat(httrack.path_log,"hts-cache/old.dat"))) && (fexist(fconcat(httrack.path_log,"hts-cache/old.ndx"))) ) { remove(fconcat(httrack.path_log,"hts-cache/new.dat")); remove(fconcat(httrack.path_log,"hts-cache/new.ndx")); @@ -723,6 +691,11 @@ int main(int argc, char **argv) { remove(fconcat(httrack.path_log,"hts-err.txt")); if (fexist(fconcat(httrack.path_html,"index.html"))) remove(fconcat(httrack.path_html,"index.html")); + /* */ + if (fexist(fconcat(httrack.path_log,"hts-cache/new.zip"))) + remove(fconcat(httrack.path_log,"hts-cache/new.zip")); + if (fexist(fconcat(httrack.path_log,"hts-cache/old.zip"))) + remove(fconcat(httrack.path_log,"hts-cache/old.zip")); if (fexist(fconcat(httrack.path_log,"hts-cache/new.dat"))) remove(fconcat(httrack.path_log,"hts-cache/new.dat")); if (fexist(fconcat(httrack.path_log,"hts-cache/new.ndx"))) @@ -894,7 +867,11 @@ int main(int argc, char **argv) { #endif if (argv_url==0) { // Présence d'un cache, que faire?.. - if ((fexist(fconcat(httrack.path_log,"hts-cache/new.dat"))) && (fexist(fconcat(httrack.path_log,"hts-cache/new.ndx")))) { // il existe déja un cache précédent.. renommer + if ( + ( fexist(fconcat(httrack.path_log,"hts-cache/new.zip")) ) + || + ( fexist(fconcat(httrack.path_log,"hts-cache/new.dat")) && fexist(fconcat(httrack.path_log,"hts-cache/new.ndx")) ) + ) { // il existe déja un cache précédent.. renommer if (fexist(fconcat(httrack.path_log,"hts-cache/doit.log"))) { // un cache est présent if (x_argvblk!=NULL) { int m; @@ -967,7 +944,11 @@ int main(int argc, char **argv) { httrack.cache=1; // cache prioritaire if (httrack.quiet==0) { - if ((fexist(fconcat(httrack.path_log,"hts-cache/new.dat"))) && (fexist(fconcat(httrack.path_log,"hts-cache/new.ndx")))) { // il existe déja un cache précédent.. renommer + if ( + ( fexist(fconcat(httrack.path_log,"hts-cache/new.zip")) ) + || + ( fexist(fconcat(httrack.path_log,"hts-cache/new.dat")) && fexist(fconcat(httrack.path_log,"hts-cache/new.ndx")) ) + ) { HT_REQUEST_START; HT_PRINT("There is a lock-file in the directory "); HT_PRINT(httrack.path_log); @@ -985,7 +966,11 @@ int main(int argc, char **argv) { //char s[32]; httrack.cache=2; // cache vient après test de validité if (httrack.quiet==0) { - if ((fexist(fconcat(httrack.path_log,"hts-cache/new.dat"))) && (fexist(fconcat(httrack.path_log,"hts-cache/new.ndx")))) { // il existe déja un cache précédent.. renommer + if ( + ( fexist(fconcat(httrack.path_log,"hts-cache/new.zip")) ) + || + ( fexist(fconcat(httrack.path_log,"hts-cache/new.dat")) && fexist(fconcat(httrack.path_log,"hts-cache/new.ndx")) ) + ) { HT_REQUEST_START; HT_PRINT("There is an index.html and a hts-cache folder in the directory "); HT_PRINT(httrack.path_log); @@ -1027,7 +1012,7 @@ int main(int argc, char **argv) { for(na=1;na 0 && (pos - a + 2) < sizeof(callbackname)) { char* posf = strchr(pos + 1, ':'); - char filename[1024]; + char BIGSTK filename[1024]; callbackname[0] = '\0'; strncatbuff(callbackname, a, pos - a); pos++; @@ -1442,38 +1428,38 @@ int main(int argc, char **argv) { filename[0] = '\0'; strncatbuff(filename, pos, posf - pos); posf++; - userfunction = getFunctionPtr(filename, posf); + userfunction = getFunctionPtr(&httrack, filename, posf); if (userfunction != NULL) { if ((void*)htswrap_read(callbackname) != NULL) { if (htswrap_add(callbackname, userfunction)) { - if (!httrack.quiet) { - set_wrappers(); - if ((void*)htswrap_read(callbackname) == userfunction) { - printf("successfully plugged [%s -> %s:%s]\n", callbackname, posf, filename); - } else { - char tmp[1024 * 2]; - sprintf(tmp, "option %%W : unable to (re)plug the function %s from the file %s for the callback %s", posf, filename, callbackname); - HTS_PANIC_PRINTF(tmp); - htsmain_free(); - return -1; + set_wrappers(); /* Re-read wrappers internal static functions */ + if ((void*)htswrap_read(callbackname) == userfunction) { + if (!httrack.quiet) { + fprintf(stderr, "successfully plugged [%s -> %s:%s]\n", callbackname, posf, filename); } + } else { + char BIGSTK tmp[1024 * 2]; + sprintf(tmp, "option %%W : unable to (re)plug the function %s from the file %s for the callback %s", posf, filename, callbackname); + HTS_PANIC_PRINTF(tmp); + htsmain_free(); + return -1; } } else { - char tmp[1024 * 2]; + char BIGSTK tmp[1024 * 2]; sprintf(tmp, "option %%W : unable to plug the function %s from the file %s for the callback %s", posf, filename, callbackname); HTS_PANIC_PRINTF(tmp); htsmain_free(); return -1; } } else { - char tmp[1024 * 2]; + char BIGSTK tmp[1024 * 2]; sprintf(tmp, "option %%W : unknown or undefined callback %s", callbackname); HTS_PANIC_PRINTF(tmp); htsmain_free(); return -1; } } else { - char tmp[1024 * 2]; + char BIGSTK tmp[1024 * 2]; sprintf(tmp, "option %%W : unable to load the function %s in the file %s for the callback %s", posf, filename, callbackname); HTS_PANIC_PRINTF(tmp); htsmain_free(); @@ -1494,6 +1480,39 @@ int main(int argc, char **argv) { } break; + case 'R': // Referer + if ((na+1>=argc) || (argv[na+1][0]=='-')) { + HTS_PANIC_PRINTF("Option %R needs to be followed by a blank space, and a referer URL"); + printf("Example: -%%R \"http://www.example.com/\"\n"); + htsmain_free(); + return -1; + } else{ + na++; + if (strlen(argv[na])>=254) { + HTS_PANIC_PRINTF("Referer URL too long"); + htsmain_free(); + return -1; + } + strcpybuff(httrack.referer, argv[na]); + } + break; + case 'E': // From Email address + if ((na+1>=argc) || (argv[na+1][0]=='-')) { + HTS_PANIC_PRINTF("Option %E needs to be followed by a blank space, and an email"); + printf("Example: -%%E \"postmaster@example.com\"\n"); + htsmain_free(); + return -1; + } else{ + na++; + if (strlen(argv[na])>=254) { + HTS_PANIC_PRINTF("From email too long"); + htsmain_free(); + return -1; + } + strcpybuff(httrack.from, argv[na]); + } + break; + default: { char s[HTS_CDLMAXSIZE]; sprintf(s,"invalid option %%%c\n",*com); @@ -1587,9 +1606,9 @@ int main(int argc, char **argv) { cache.hashtable=(void*)cache_hashtable; /* copy backcache hash */ cache.ro = 1; /* read only */ if (cache.hashtable) { - char adr[HTS_URLMAXSIZE*2]; - char fil[HTS_URLMAXSIZE*2]; - char url[HTS_URLMAXSIZE*2]; + char BIGSTK adr[HTS_URLMAXSIZE*2]; + char BIGSTK fil[HTS_URLMAXSIZE*2]; + char BIGSTK url[HTS_URLMAXSIZE*2]; char linepos[256]; int pos; char* cacheNdx = readfile(fconcat(httrack.path_log,"hts-cache/new.ndx")); @@ -1620,7 +1639,7 @@ int main(int argc, char **argv) { || (strjoker(url, filter, NULL, NULL) != NULL) ) { - r = cache_read(&httrack, &cache, adr, fil, "", NULL); // lire entrée cache + data + r = cache_read_ro(&httrack, &cache, adr, fil, "", NULL); // lire entrée cache + data if (r.statuscode != -1) { // No errors found++; if (!hasFilter) { @@ -1629,7 +1648,7 @@ int main(int argc, char **argv) { adr, fil); } else { char msg[256], cdate[256]; - char sav[HTS_URLMAXSIZE*2]; + char BIGSTK sav[HTS_URLMAXSIZE*2]; infostatuscode(msg, r.statuscode); time_gmt_rfc822(cdate); @@ -1713,6 +1732,14 @@ int main(int argc, char **argv) { return 0; } break; + case 'E': // extract cache + if (!hts_extract_meta(httrack.path_log)) { + fprintf(stderr, "* error extracting meta-data\n"); + return 1; + } + fprintf(stderr, "* successfully extracted meta-data\n"); + return 0; + break; case 'X': #ifndef STRDEBUG fprintf(stderr, "warning: no string debugging support built, option has no effect\n"); @@ -1720,6 +1747,34 @@ int main(int argc, char **argv) { htsMemoryFastXfr=1; if (*(com+1)=='0') { htsMemoryFastXfr=0; com++; } break; + case 'R': + { + char* name; + uLong repaired = 0; + uLong repairedBytes = 0; + if (fexist(fconcat(httrack.path_log,"hts-cache/new.zip"))) { + name = fconcat(httrack.path_log,"hts-cache/new.zip"); + } else if (fexist(fconcat(httrack.path_log,"hts-cache/old.zip"))) { + name = fconcat(httrack.path_log,"hts-cache/old.zip"); + } else { + fprintf(stderr, "* error: no cache found in %s\n", fconcat(httrack.path_log,"hts-cache/new.zip")); + return 1; + } + fprintf(stderr, "Cache: trying to repair %s\n", name); + if (unzRepair(name, + fconcat(httrack.path_log,"hts-cache/repair.zip"), + fconcat(httrack.path_log,"hts-cache/repair.tmp"), + &repaired, &repairedBytes + ) == Z_OK) { + unlink(name); + rename(fconcat(httrack.path_log,"hts-cache/repair.zip"), name); + fprintf(stderr,"Cache: %d bytes successfully recovered in %d entries\n", (int) repairedBytes, (int) repaired); + } else { + fprintf(stderr, "Cache: could not repair the cache\n"); + } + } + return 0; + break; case '~': /* internal lib test */ { char thisIsATestYouShouldSeeAnError[12]; @@ -1742,11 +1797,12 @@ int main(int argc, char **argv) { case 'T': httrack.maketrack=1; break; case 'u': sscanf(com+1,"%d",&httrack.waittime); while(isdigit((unsigned char)*(com+1))) com++; break; - case 'R': // ohh ftp, catch->ftpget + /*case 'R': // ohh ftp, catch->ftpget HTS_PANIC_PRINTF("Unexpected internal error with -#R command"); htsmain_free(); return -1; break; + */ case 'P': { // catchurl help_catchurl(httrack.path_log); htsmain_free(); @@ -1769,6 +1825,19 @@ int main(int argc, char **argv) { return 0; } break; + case '1': /* test #1 : fil_simplifie */ + if (na+1>=argc) { + HTS_PANIC_PRINTF("Option #1 needs to be followed by an URL"); + printf("Example: '-#0' ./foo/bar/../foobar\n"); + htsmain_free(); + return -1; + } else { + fil_simplifie(argv[na+1]); + printf("simplified=%s\n", argv[na+1]); + htsmain_free(); + return 0; + } + break; case '!': if (na+1>=argc) { HTS_PANIC_PRINTF("Option #! needs to be followed by a commandline"); @@ -1779,6 +1848,15 @@ int main(int argc, char **argv) { system(argv[na+1]); } break; + case 'd': + httrack.parsedebug = 1; + break; + + /* autotest */ + case 't': /* not yet implemented */ + fprintf(stderr, "** AUTOCHECK OK\n"); + exit(0); + break; default: printf("Internal option %c not recognized\n",*com); break; } @@ -1866,7 +1944,7 @@ int main(int argc, char **argv) { } // while } else { // URL/filters - char tempo[1024]; + char BIGSTK tempo[1024]; if (strnotempty(url)) strcatbuff(url," "); // espace de séparation strcpybuff(tempo,unescape_http_unharm(argv[na],1)); escape_spc_url(tempo); @@ -1895,7 +1973,7 @@ int main(int argc, char **argv) { //if (userdef) { if (!userid) { //if (strcmp(userdef->pw_name,"root")==0) { - char rpath[1024]; + char BIGSTK rpath[1024]; //printf("html=%s log=%s\n",httrack.path_html,httrack.path_log); // xxc if ((httrack.path_html[0]) && (httrack.path_log[0])) { char *a=httrack.path_html,*b=httrack.path_log,*c=NULL,*d=NULL; @@ -1913,7 +1991,7 @@ int main(int argc, char **argv) { strncatbuff(rpath,httrack.path_html,(int) (c - httrack.path_html)); } { - char tmp[1024]; + char BIGSTK tmp[1024]; strcpybuff(tmp,c); strcpybuff(httrack.path_html,tmp); strcpybuff(tmp,d); strcpybuff(httrack.path_log,tmp); } @@ -1971,7 +2049,19 @@ int main(int argc, char **argv) { // cad la version contenant le plus de fichiers if (httrack.cache) { if (fexist(fconcat(httrack.path_log,"hts-in_progress.lock"))) { // problemes.. - if (fexist(fconcat(httrack.path_log,"hts-cache/new.dat")) && fexist(fconcat(httrack.path_log,"hts-cache/new.ndx"))) { + if ( fexist(fconcat(httrack.path_log,"hts-cache/new.dat")) ) { + if ( fexist(fconcat(httrack.path_log,"hts-cache/old.zip")) ) { + if (fsize(fconcat(httrack.path_log,"hts-cache/new.zip"))<32768) { + if (fsize(fconcat(httrack.path_log,"hts-cache/old.zip"))>65536) { + if (fsize(fconcat(httrack.path_log,"hts-cache/old.zip")) > fsize(fconcat(httrack.path_log,"hts-cache/new.zip"))) { + remove(fconcat(httrack.path_log,"hts-cache/new.zip")); + rename(fconcat(httrack.path_log,"hts-cache/old.zip"), fconcat(httrack.path_log,"hts-cache/new.zip")); + } + } + } + } + } + else if (fexist(fconcat(httrack.path_log,"hts-cache/new.dat")) && fexist(fconcat(httrack.path_log,"hts-cache/new.ndx"))) { if (fexist(fconcat(httrack.path_log,"hts-cache/old.dat")) && fexist(fconcat(httrack.path_log,"hts-cache/old.ndx"))) { // switcher si new<32Ko et old>65Ko (tailles arbitraires) ? // ce cas est peut être une erreur ou un crash d'un miroir ancien, prendre @@ -2058,7 +2148,7 @@ int main(int argc, char **argv) { fprintf(fp,"and is used for updating this website."LF); fprintf(fp,"(The HTML website structure is stored here to allow fast updates)"LF""LF); fprintf(fp,"DO NOT delete this folder unless you do not want to update the mirror in the future!!"LF); - fprintf(fp,"(you can safely delete old.dat, old.ndx and old.lst files, however)"LF); + fprintf(fp,"(you can safely delete old.zip and old.lst files, however)"LF); fprintf(fp,""LF); fprintf(fp,HTS_LOG_SECURITY_WARNING); fclose(fp); @@ -2177,18 +2267,47 @@ int main(int argc, char **argv) { io_flush; + /* Enforce limits to avoid bandwith abuse. The bypass_limits should only be used by administrators and experts. */ + if (!httrack.bypass_limits) { + if (httrack.maxsoc <= 0 || httrack.maxsoc > 4) { + httrack.maxsoc = 4; + if (httrack.log != NULL) { + fspc(httrack.log,"warning"); fprintf(httrack.log,"* security warning: maximum number of simultaneous connections limited to %d to avoid server overload"LF, (int)httrack.maxsoc); + } + } + if (httrack.maxrate <= 0 || httrack.maxrate > 100000) { + httrack.maxrate = 100000; + if (httrack.log != NULL) { + fspc(httrack.log,"warning"); fprintf(httrack.log,"* security warning: maximum bandwidth limited to %d to avoid server overload"LF, (int)httrack.maxrate); + } + } + if (httrack.maxconn <= 0 || httrack.maxconn > 5.0) { + httrack.maxconn = 5.0; + if (httrack.log != NULL) { + fspc(httrack.log,"warning"); fprintf(httrack.log,"* security warning: maximum number of connections per second limited to %f to avoid server overload"LF, (float)httrack.maxconn); + } + } + } else { + if (httrack.log != NULL) { + fspc(httrack.log,"warning"); fprintf(httrack.log,"* security warning: !!! BYPASSING SECURITY LIMITS - MONITOR THIS SESSION WITH EXTREME CARE !!!"LF); + } + } + /* Info for wrappers */ if ( (httrack.debug>0) && (httrack.log!=NULL) ) { fspc(httrack.log,"info"); fprintf(httrack.log,"engine: init"LF); } #if HTS_ANALYSTE hts_htmlcheck_init(); + set_wrappers(); // init() is allowed to set other wrappers #endif // détourner SIGHUP etc. #if HTS_WIN +#ifndef _WIN32_WCE signal( SIGINT , sig_ask ); // ^C signal( SIGTERM , sig_finish ); // kill +#endif #else signal( SIGHUP , sig_back ); // close window signal( SIGTSTP , sig_back ); // ^Z @@ -2226,7 +2345,7 @@ deprecated - see SIGCHLD // // Build top index if (httrack.dir_topindex) { - char rpath[1024*2]; + char BIGSTK rpath[1024*2]; char* a; strcpybuff(rpath,httrack.path_html); if (rpath[0]) { @@ -2249,33 +2368,35 @@ deprecated - see SIGCHLD } } - /* Info for wrappers */ - if ( (httrack.debug>0) && (httrack.log!=NULL) ) { - fspc(httrack.log,"info"); fprintf(httrack.log,"engine: free"LF); - } + /* Info for wrappers */ + if ( (httrack.debug>0) && (httrack.log!=NULL) ) { + fspc(httrack.log,"info"); fprintf(httrack.log,"engine: free"LF); + } #if HTS_ANALYSTE - hts_htmlcheck_uninit(); + hts_htmlcheck_uninit(); #endif - + if (httrack_logmode!=1) { if (httrack.errlog == httrack.log) httrack.errlog=NULL; if (httrack.log) { fclose(httrack.log); httrack.log=NULL; } if (httrack.errlog) { fclose(httrack.errlog); httrack.errlog=NULL; } } - + // Débuggage des en têtes if (_DEBUG_HEAD) { if (ioinfo) { fclose(ioinfo); } } - + // supprimer lock remove(n_lock); } - + if (x_argvblk) freet(x_argvblk); + if (x_argv) + freet(x_argv); #if HTS_WIN #if HTS_ANALYSTE!=2 @@ -2315,7 +2436,7 @@ int check_path(char* s,char* defaultname) { if (strnotempty(s)) { if (s[(i=strlen(s))-1]=='#') { if (strnotempty((defaultname?defaultname:""))) { - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; char* a=strchr(defaultname,'#'); // we never know.. if (a) *a='\0'; tempo[0]='\0'; @@ -2339,7 +2460,7 @@ int check_path(char* s,char* defaultname) { // détermine si l'argument est une option int cmdl_opt(char* s) { if (s[0]=='-') { // c'est peut être une option - if (strchr(s,'.')!=NULL) + if (strchr(s,'.')!=NULL && strchr(s,'%')==NULL) return 0; // sans doute un -www.truc.fr (note: -www n'est pas compris) else if (strchr(s,'/')!=NULL) return 0; // idem, -*cgi-bin/ diff --git a/src/htscoremain.h b/src/htscoremain.h index 3662793..548c7f6 100644 --- a/src/htscoremain.h +++ b/src/htscoremain.h @@ -46,6 +46,8 @@ Please visit our Website: http://www.httrack.com #include "htsglobal.h" +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE // Main, récupère les paramètres et appelle le robot #if HTS_ANALYSTE #ifndef HTTRACK_DEFLIB @@ -58,7 +60,7 @@ int main(int argc, char **argv); int cmdl_opt(char* s); int check_path(char* s,char* defaultname); - +#endif #endif diff --git a/src/htsdefines.h b/src/htsdefines.h index 0ab2cfa..e91b5b4 100644 --- a/src/htsdefines.h +++ b/src/htsdefines.h @@ -43,6 +43,7 @@ typedef void (* t_hts_htmlcheck_uninit)(void); typedef int (* t_hts_htmlcheck_start)(httrackp* opt); typedef int (* t_hts_htmlcheck_end)(void); typedef int (* t_hts_htmlcheck_chopt)(httrackp* opt); +typedef int (* t_hts_htmlcheck_process)(char** html,int* len,char* url_adresse,char* url_fichier); typedef int (* t_hts_htmlcheck)(char* html,int len,char* url_adresse,char* url_fichier); typedef char* (* t_hts_htmlcheck_query)(char* question); typedef char* (* t_hts_htmlcheck_query2)(char* question); @@ -52,11 +53,14 @@ typedef int (* t_hts_htmlcheck_check)(char* adr,char* fil,int status); typedef void (* t_hts_htmlcheck_pause)(char* lockfile); typedef void (* t_hts_htmlcheck_filesave)(char* file); typedef int (* t_hts_htmlcheck_linkdetected)(char* link); +typedef int (* t_hts_htmlcheck_linkdetected2)(char* link, char* tag_start); typedef int (* t_hts_htmlcheck_xfrstatus)(lien_back* back); typedef int (* t_hts_htmlcheck_savename)(char* adr_complete,char* fil_complete,char* referer_adr,char* referer_fil,char* save); typedef int (* t_hts_htmlcheck_sendhead)(char* buff, char* adr, char* fil, char* referer_adr, char* referer_fil, htsblk* outgoing); typedef int (* t_hts_htmlcheck_receivehead)(char* buff, char* adr, char* fil, char* referer_adr, char* referer_fil, htsblk* incoming); +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE // demande d'interaction avec le shell #if HTS_ANALYSTE extern char HTbuff[2048]; @@ -65,6 +69,8 @@ extern t_hts_htmlcheck_uninit hts_htmlcheck_uninit; extern t_hts_htmlcheck_start hts_htmlcheck_start; extern t_hts_htmlcheck_end hts_htmlcheck_end; extern t_hts_htmlcheck_chopt hts_htmlcheck_chopt; +extern t_hts_htmlcheck_process hts_htmlcheck_preprocess; +extern t_hts_htmlcheck_process hts_htmlcheck_postprocess; extern t_hts_htmlcheck hts_htmlcheck; extern t_hts_htmlcheck_query hts_htmlcheck_query; extern t_hts_htmlcheck_query2 hts_htmlcheck_query2; @@ -74,6 +80,7 @@ extern t_hts_htmlcheck_check hts_htmlcheck_check; extern t_hts_htmlcheck_pause hts_htmlcheck_pause; extern t_hts_htmlcheck_filesave hts_htmlcheck_filesave; extern t_hts_htmlcheck_linkdetected hts_htmlcheck_linkdetected; +extern t_hts_htmlcheck_linkdetected2 hts_htmlcheck_linkdetected2; extern t_hts_htmlcheck_xfrstatus hts_htmlcheck_xfrstatus; extern t_hts_htmlcheck_savename hts_htmlcheck_savename; extern t_hts_htmlcheck_sendhead hts_htmlcheck_sendhead; @@ -102,3 +109,5 @@ extern t_hts_htmlcheck_receivehead hts_htmlcheck_receivehead; #endif +#endif + diff --git a/src/htsfilters.c b/src/htsfilters.c index be8b482..681b506 100644 --- a/src/htsfilters.c +++ b/src/htsfilters.c @@ -35,6 +35,9 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + // *.gif match all gif files // *[file]/*[file].exe match all exe files with one folder structure @@ -49,9 +52,6 @@ Please visit our Website: http://www.httrack.com /* specific definitions */ #include "htsbase.h" #include "htslib.h" -#include -#include -#include #include /* END specific definitions */ diff --git a/src/htsfilters.h b/src/htsfilters.h index 168d330..f963322 100644 --- a/src/htsfilters.h +++ b/src/htsfilters.h @@ -42,8 +42,11 @@ Please visit our Website: http://www.httrack.com #include "htsbase.h" +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE int fa_strjoker(char** filters,int nfil,char* nom,LLint* size,int* size_flag,int* depth); HTS_INLINE char* strjoker(char* chaine,char* joker,LLint* size,int* size_flag); char* strjokerfind(char* chaine,char* joker); +#endif #endif diff --git a/src/htsftp.c b/src/htsftp.c index 68a8af5..7b04052 100644 --- a/src/htsftp.c +++ b/src/htsftp.c @@ -34,6 +34,9 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + // Gestion protocole ftp // Version .05 (01/2000) @@ -43,9 +46,6 @@ Please visit our Website: http://www.httrack.com #include "htsbase.h" #include "htsnet.h" #include "htsthread.h" -#include -#include -#include #if HTS_WIN #else //inet_ntoa @@ -55,9 +55,11 @@ Please visit our Website: http://www.httrack.com #if HTS_WIN #ifndef __cplusplus // DOS +#ifndef _WIN32_WCE #include /* _beginthread, _endthread */ #endif #endif +#endif // ftp mode passif // #if HTS_INET6==0 @@ -73,31 +75,10 @@ Please visit our Website: http://www.httrack.com #define FTP_STATUS_READY 1001 #if USE_BEGINTHREAD -/* -#ifdef __cplusplus -// C++ -> Shell -UINT back_launch_ftp( LPVOID pP ) { - lien_back* back=(lien_back*) pP; - if (back == NULL) { - //back->status=FTP_STATUS_READY; // fini - //back->r.statuscode=-1; - return -1; - } - - // lancer ftp - run_launch_ftp(back); - // prêt - back->status=0; - return 0; // thread completed successfully -} -#else -*/ -PTHREAD_TYPE back_launch_ftp( void* pP ) { +PTHREAD_TYPE PTHREAD_TYPE_FNC back_launch_ftp( void* pP ) { lien_back* back=(lien_back*) pP; if (back == NULL) { - //back->status=FTP_STATUS_READY; // fini - //back->r.statuscode=-1; #if FTP_DEBUG printf("[ftp error: no args]\n"); #endif @@ -113,27 +94,19 @@ PTHREAD_TYPE back_launch_ftp( void* pP ) { #endif run_launch_ftp(back); // prêt - back->status=0; + back->status=FTP_STATUS_READY; /* Uninitialize */ hts_uninit(); return PTHREAD_RETURN; } -/*#endif*/ // lancer en back void launch_ftp(lien_back* back) { -/* -#ifdef __cplusplus - // C++ -> Shell - AfxBeginThread(back_launch_ftp,(LPVOID) back); -#else -*/ // DOS #if FTP_DEBUG printf("[Launching main ftp thread]\n"); #endif - _beginthread(back_launch_ftp, 0, (void*) back); -/*#endif*/ + (void)hts_newthread(back_launch_ftp, 0, (void*) back); } #else @@ -142,7 +115,7 @@ int back_launch_ftp(lien_back* back) { // lancer ftp run_launch_ftp(back); // prêt - back->status=0; + back->status=FTP_STATUS_READY; return 0; } void launch_ftp(lien_back* back,char* path,char* exec) { @@ -213,7 +186,7 @@ int run_launch_ftp(lien_back* back) { #if FTP_PASV int port_pasv=0; #endif - char adr_ip[1024]; + char BIGSTK adr_ip[1024]; char *adr,*real_adr; char* ftp_filename=""; int timeout = 300; // timeout @@ -281,7 +254,11 @@ int run_launch_ftp(lien_back* back) { ftp_filename=a; if (strnotempty(a)) { char* ua=unescape_http(a); - if ( + int len_a = (int) strlen(ua); + if (len_a > 0 && ua[len_a -1] == '/') { /* obviously a directory listing */ + transfer_list=1; + sprintf(line_retr,"LIST -A %s",ua); + } else if ( (strchr(ua, ' ')) || (strchr(ua, '\"')) @@ -298,7 +275,7 @@ int run_launch_ftp(lien_back* back) { } } else { strcpybuff(back->r.msg,"Unexpected PORT error"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } } @@ -332,7 +309,7 @@ int run_launch_ftp(lien_back* back) { hp = hts_gethostbyname(_adr, &fullhostent_buffer); if (hp == NULL) { strcpybuff(back->r.msg,"Unable to get server's address"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-5; _HALT_FTP return 0; @@ -349,7 +326,7 @@ int run_launch_ftp(lien_back* back) { soc_ctl=socket(SOCaddr_sinfamily(server), SOCK_STREAM, 0); if (soc_ctl==INVALID_SOCKET) { strcpybuff(back->r.msg,"Unable to create a socket"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; _HALT_FTP return 0; @@ -367,7 +344,7 @@ int run_launch_ftp(lien_back* back) { if (connect(soc_ctl, (struct sockaddr *)&server, server_size) == -1) { #endif strcpybuff(back->r.msg,"Unable to connect to the server"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; _HALT_FTP return 0; @@ -379,7 +356,7 @@ int run_launch_ftp(lien_back* back) { _CHECK_HALT_FTP; { - char line[1024]; + char BIGSTK line[1024]; // envoi du login // --USER-- @@ -400,13 +377,23 @@ int run_launch_ftp(lien_back* back) { get_ftp_line(soc_ctl,line,timeout); _CHECK_HALT_FTP; if (line[0]=='2') { // ok + send_line(soc_ctl,"TYPE I"); + get_ftp_line(soc_ctl,line,timeout); + _CHECK_HALT_FTP; + if (line[0]=='2') { + // ok + } else { + strcpybuff(back->r.msg,"TYPE I error"); + // back->status=FTP_STATUS_READY; // fini + back->r.statuscode=-1; + } #if 0 // --CWD-- char* a; a=back->url_fil + strlen(back->url_fil)-1; while( (a > back->url_fil) && (*a!='/')) a--; if (*a == '/') { // ok repéré - char target[1024]; + char BIGSTK target[1024]; target[0]='\0'; strncatbuff(target,back->url_fil,(int) (a - back->url_fil)); if (strnotempty(target)==0) @@ -424,34 +411,34 @@ int run_launch_ftp(lien_back* back) { // ok.. } else { strcpybuff(back->r.msg,"TYPE I error"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } } else { sprintf(back->r.msg,"CWD error: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } // sinon on est prêts } else { strcpybuff(back->r.msg,"Unexpected ftp error"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } #endif } else { sprintf(back->r.msg,"Bad password: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } } else { sprintf(back->r.msg,"Bad user name: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } } else { sprintf(back->r.msg,"Connection refused: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } @@ -510,7 +497,7 @@ int run_launch_ftp(lien_back* back) { // -- fin analyse de l'adresse IP et du port -- } else { sprintf(back->r.msg,"PASV incorrect: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } // sinon on est prêts } else { @@ -542,12 +529,12 @@ int run_launch_ftp(lien_back* back) { } } else { sprintf(back->r.msg,"EPSV incorrect: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } } else { sprintf(back->r.msg,"PASV/EPSV error: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } // sinon on est prêts } @@ -663,7 +650,7 @@ int run_launch_ftp(lien_back* back) { deletesoc(soc_dat); soc_dat=INVALID_SOCKET; // sprintf(back->r.msg,"RETR command errror: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } // sinon on est prêts } else { @@ -673,22 +660,22 @@ int run_launch_ftp(lien_back* back) { deletesoc(soc_dat); soc_dat=INVALID_SOCKET; // strcpybuff(back->r.msg,"Unable to connect"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } // sinon on est prêts } else { strcpybuff(back->r.msg,"Unable to create a socket"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } // sinon on est prêts } else { sprintf(back->r.msg,"Unable to resolve IP %s",adr_ip); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } // sinon on est prêts } else { sprintf(back->r.msg,"PASV incorrect: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } // sinon on est prêts #else @@ -711,17 +698,17 @@ int run_launch_ftp(lien_back* back) { int dummylen = sizeof(struct sockaddr); if ( (soc_dat=accept(soc_servdat,&dummyaddr,&dummylen)) == INVALID_SOCKET) { strcpybuff(back->r.msg,"Unable to accept connection"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } } else { sprintf(back->r.msg,"RETR command errror: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } } else { sprintf(back->r.msg,"PORT command error: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } #if HTS_WIN @@ -731,7 +718,7 @@ int run_launch_ftp(lien_back* back) { #endif } else { strcpybuff(back->r.msg,"Unable to listen to a port"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } #endif @@ -747,7 +734,7 @@ int run_launch_ftp(lien_back* back) { back->r.fp = filecreate(back->url_sav); strcpybuff(back->info,"receiving"); if (back->r.fp != NULL) { - char buff[1024]; + char BIGSTK buff[1024]; int len=1; int read_len=1024; //HTS_TOTAL_RECV_CHECK(read_len); // Diminuer au besoin si trop de données reçues @@ -758,13 +745,13 @@ int run_launch_ftp(lien_back* back) { switch(wait_socket_receive(soc_dat,timeout)) { case -1: strcpybuff(back->r.msg,"FTP read error"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; len=0; // fin break; case 0: sprintf(back->r.msg,"Time out (%d)",timeout); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; len=0; // fin break; @@ -785,17 +772,17 @@ int run_launch_ftp(lien_back* back) { } */ strcpybuff(back->r.msg,"Write error"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; len=0; // error } } else { strcpybuff(back->r.msg,"Unexpected write error"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } } else { // Erreur ou terminé - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=0; if (back->r.totalsize > 0 && back->r.size != back->r.totalsize) { back->r.statuscode=-1; @@ -812,7 +799,7 @@ int run_launch_ftp(lien_back* back) { } } else { strcpybuff(back->r.msg,"Unable to write file"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } #if HTS_WIN @@ -828,16 +815,16 @@ int run_launch_ftp(lien_back* back) { get_ftp_line(soc_ctl,line,timeout); if (line[0]=='2') { // OK strcpybuff(back->r.msg,"OK"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=200; } else { sprintf(back->r.msg,"RETR incorrect: %s",linejmp(line)); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } } else { strcpybuff(back->r.msg,"FTP read error"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; } } @@ -866,7 +853,7 @@ int run_launch_ftp(lien_back* back) { back->r.statuscode=200; strcpybuff(back->r.msg,"OK"); } - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini return 0; } @@ -976,7 +963,7 @@ FILE* dd=NULL; // routines de réception/émission // 0 = ERROR int send_line(T_SOC soc,char* data) { - char line[1024]; + char BIGSTK line[1024]; if (_DEBUG_HEAD) { if (ioinfo) { fprintf(ioinfo,"---> %s\x0d\x0a",data); @@ -1007,7 +994,7 @@ int send_line(T_SOC soc,char* data) { } int get_ftp_line(T_SOC soc,char* line,int timeout) { - char data[1024]; + char BIGSTK data[1024]; int i,ok,multiline; #if FTP_DEBUG if (dd == NULL) dd = fopen("toto.txt","w"); @@ -1152,7 +1139,7 @@ int wait_socket_receive(T_SOC soc,int timeout) { int stop_ftp(lien_back* back) { if (back->stop_ftp) { strcpybuff(back->r.msg,"Cancelled by User"); - back->status=FTP_STATUS_READY; // fini + // back->status=FTP_STATUS_READY; // fini back->r.statuscode=-1; return 1; } diff --git a/src/htsftp.h b/src/htsftp.h index e24f1f3..08ab784 100644 --- a/src/htsftp.h +++ b/src/htsftp.h @@ -45,9 +45,11 @@ Please visit our Website: http://www.httrack.com // lien_back #include "htscore.h" +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE #if USE_BEGINTHREAD void launch_ftp(lien_back* back); -PTHREAD_TYPE back_launch_ftp( void* pP ); +PTHREAD_TYPE PTHREAD_TYPE_FNC back_launch_ftp( void* pP ); #else void launch_ftp(lien_back* back,char* path,char* exec); int back_launch_ftp(lien_back* back); @@ -62,7 +64,7 @@ char* linejmp(char* line); int check_socket(T_SOC soc); int check_socket_connect(T_SOC soc); int wait_socket_receive(T_SOC soc,int timeout); - +#endif #endif diff --git a/src/htsglobal.h b/src/htsglobal.h index 38faebc..d045f14 100644 --- a/src/htsglobal.h +++ b/src/htsglobal.h @@ -40,20 +40,45 @@ Please visit our Website: http://www.httrack.com #define HTTRACK_GLOBAL_DEFH // Version -#define HTTRACK_VERSION "3.30" -#define HTTRACK_VERSIONID "3.30.01" +#define HTTRACK_VERSION "3.33-2" +#define HTTRACK_VERSIONID "3.33.16" #define HTTRACK_AFF_VERSION "3.x" //#define HTTRACK_AFF_WARNING "This is a BETA release of WinHTTrack Website Copier ("HTTRACK_VERSION")\nPlease report any crashes, bugs or problems" - +#ifndef HTS_NOINCLUDES +#ifndef _WIN32_WCE +#include +#include +#else +#include +#include +#ifdef HTS_CECOMPAT +#include "cecompat.h" +#else +#include "celib.h" +#endif +#endif +#endif // Définition plate-forme #include "htssystem.h" #include "htsconfig.h" +// WIN32 types +#ifdef _WIN32 +#ifndef SIZEOF_LONG +#define SIZEOF_LONG 4 +#define SIZEOF_LONG_LONG 8 +#endif +#endif + + // config.h #ifdef _WIN32 +// WIN32 +#ifndef _WIN32_WCE + #define HAVE_SYS_STAT_H 1 #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 @@ -69,6 +94,35 @@ Please visit our Website: http://www.httrack.com #else +// Win32CE +//#pragma runtime_checks( "s", restore ) +#define HTS_SPARE_MEMORY 1 +#define HTS_ALIGN 8 +#define BIGSTK static +#undef DLLIB // LoadLibrary(libssl) crashes +#define NOSTRDEBUG 1 +#undef HTS_MAKE_KEYWORD_INDEX +#ifdef HTS_CECOMPAT +#define HTS_DO_NOT_USE_FTIME 1 +#undef HAVE_SYS_STAT_H +#undef HAVE_SYS_TYPES_H +#else +#undef HTS_DO_NOT_USE_FTIME +#define HAVE_SYS_STAT_H 1 +#define HAVE_SYS_TYPES_H 1 +#endif + +#define HTS_DLOPEN 0 +#undef HTS_INET6 +#ifndef S_ISREG +#define S_ISREG(m) ((m) & _S_IFREG) + +#endif + +#endif + +#else + #include "config.h" #ifndef FTIME @@ -110,7 +164,6 @@ Please visit our Website: http://www.httrack.com #endif - // Socket windows ou socket unix #ifdef _WIN32 #undef HTS_PLATFORM @@ -126,6 +179,15 @@ Please visit our Website: http://www.httrack.com #endif #endif +// don't spare memory usage by default +#ifndef HTS_SPARE_MEMORY +#define HTS_SPARE_MEMORY 0 +#endif + +#ifndef BIGSTK +#define BIGSTK +#endif + // compatibilité DOS #if HTS_WIN #define HTS_DOSNAME 1 @@ -208,14 +270,24 @@ Please visit our Website: http://www.httrack.com #endif +#if HTS_SPARE_MEMORY==0 /* Gestion des tables de hashage */ #define HTS_HASH_SIZE 20147 /* Taille max d'une URL */ #define HTS_URLMAXSIZE 1024 /* Taille max ligne de commande (>=HTS_URLMAXSIZE*2) */ #define HTS_CDLMAXSIZE 1024 +#else +/* Gestion des tables de hashage */ +#define HTS_HASH_SIZE 1023 +/* Taille max d'une URL */ +#define HTS_URLMAXSIZE 256 +/* Taille max ligne de commande (>=HTS_URLMAXSIZE*2) */ +#define HTS_CDLMAXSIZE 1024 +#endif + /* Copyright (C) Xavier Roche and other contributors */ -#define HTTRACK_AFF_AUTHORS "[XR&CO'2003]" +#define HTTRACK_AFF_AUTHORS "[XR&CO'2005]" #define HTS_DEFAULT_FOOTER "" #define HTTRACK_WEB "http://www.httrack.com" #define HTS_UPDATE_WEBSITE "http://www.httrack.com/update.php3?Product=HTTrack&Version="HTTRACK_VERSIONID"&VersionStr="HTTRACK_VERSION"&Platform=%d&Language=%s" @@ -357,7 +429,11 @@ typedef int INTsys; #define LOCAL_SOCKET_ID -500000 // taille de chaque buffer (10 sockets 650 ko) -#define TAILLE_BUFFER 65535 +#if HTS_SPARE_MEMORY==0 +#define TAILLE_BUFFER 65536 +#else +#define TAILLE_BUFFER 8192 +#endif #if HTS_WIN #else @@ -390,11 +466,11 @@ typedef int INTsys; //#define HTS_TRACE_MALLOC #ifdef HTS_TRACE_MALLOC typedef unsigned long int t_htsboundary; -typedef struct _mlink { +typedef struct mlink { char* adr; int len; int id; - struct _mlink* next; + struct mlink* next; } mlink; static const t_htsboundary htsboundary = 0xDEADBEEF; #endif @@ -449,5 +525,22 @@ static const t_htsboundary htsboundary = 0xDEADBEEF; // htsmain #define DEBUG_STEPS 0 + +// Débuggage de contrôle +#if HTS_DEBUG_CLOSESOCK +#define _HTS_WIDE 1 +#endif +#if HTS_WIDE_DEBUG +#define _HTS_WIDE 1 +#endif +#if _HTS_WIDE +extern FILE* DEBUG_fp; +#define DEBUG_W(A) { if (DEBUG_fp==NULL) DEBUG_fp=fopen("bug.out","wb"); fprintf(DEBUG_fp,":>"A); fflush(DEBUG_fp); } +#undef _ +#define _ , +#endif + + + #endif diff --git a/src/htshash.c b/src/htshash.c index 3cbdb5f..38a2d64 100644 --- a/src/htshash.c +++ b/src/htshash.c @@ -35,15 +35,15 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + #include "htshash.h" /* specific definitions */ #include "htsbase.h" #include "htsglobal.h" #include "htsmd5.h" -#include -#include -#include /* END specific definitions */ /* Specific macros */ @@ -63,7 +63,7 @@ Please visit our Website: http://www.httrack.com // recherche dans la table selon nom1,nom2 et le no d'enregistrement // retour: position ou -1 si non trouvé int hash_read(hash_struct* hash,char* nom1,char* nom2,int type,int normalized) { - char normfil_[HTS_URLMAXSIZE*2]; + char BIGSTK normfil_[HTS_URLMAXSIZE*2]; char* normfil; char* normadr; unsigned int cle; @@ -199,7 +199,7 @@ int hash_read(hash_struct* hash,char* nom1,char* nom2,int type,int normalized) { // enregistrement lien lpos dans les 3 tables hash1..3 void hash_write(hash_struct* hash,int lpos,int normalized) { - char normfil_[HTS_URLMAXSIZE*2]; + char BIGSTK normfil_[HTS_URLMAXSIZE*2]; char* normfil; unsigned int cle; int pos; diff --git a/src/htshash.h b/src/htshash.h index c4acff1..43b5003 100644 --- a/src/htshash.h +++ b/src/htshash.h @@ -42,10 +42,13 @@ Please visit our Website: http://www.httrack.com #include "htscore.h" +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE // tables de hashage int hash_read(hash_struct* hash,char* nom1,char* nom2,int type,int normalized); void hash_write(hash_struct* hash,int lpos,int normalized); int* hash_calc_chaine(hash_struct* hash,int type,int pos); unsigned long int hash_cle(char* nom1,char* nom2); +#endif #endif diff --git a/src/htshelp.c b/src/htshelp.c index 7046929..af6f742 100644 --- a/src/htshelp.c +++ b/src/htshelp.c @@ -35,6 +35,9 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + #include "htshelp.h" /* specific definitions */ @@ -43,9 +46,6 @@ Please visit our Website: http://www.httrack.com #include "htscatchurl.h" #include "htslib.h" #include "htsalias.h" -#include -#include -#include #if HTS_WIN #else #ifdef HAVE_UNISTD_H @@ -334,20 +334,20 @@ int help_query(char* list,int def) { // Capture d'URL void help_catchurl(char* dest_path) { - char adr_prox[HTS_URLMAXSIZE*2]; + char BIGSTK adr_prox[HTS_URLMAXSIZE*2]; int port_prox; T_SOC soc=catch_url_init_std(&port_prox,adr_prox); if (soc!=INVALID_SOCKET) { - char url[HTS_URLMAXSIZE*2]; + char BIGSTK url[HTS_URLMAXSIZE*2]; char method[32]; - char data[32768]; + char BIGSTK data[32768]; url[0]=method[0]=data[0]='\0'; // printf("Okay, temporary proxy installed.\nSet your browser's preferences to:\n\n"); printf("\tProxy's address: \t%s\n\tProxy's port: \t%d\n",adr_prox,port_prox); // if (catch_url(soc,url,method,data)) { - char dest[HTS_URLMAXSIZE*2]; + char BIGSTK dest[HTS_URLMAXSIZE*2]; int i=0; do { sprintf(dest,"%s%s%d",dest_path,"hts-post",i); @@ -362,7 +362,7 @@ void help_catchurl(char* dest_path) { } // former URL! { - char finalurl[HTS_URLMAXSIZE*2]; + char BIGSTK finalurl[HTS_URLMAXSIZE*2]; escape_check_url(dest); sprintf(finalurl,"%s"POSTTOK"file:%s",url,dest); printf("\nThe URL is: \"%s\"\n",finalurl); @@ -471,7 +471,7 @@ void help(char* app,int more) { infomsg(" bN accept cookies in cookies.txt (0=do not accept,* 1=accept)"); infomsg(" u check document type if unknown (cgi,asp..) (u0 don't check, * u1 check but /, u2 check always)"); infomsg(" j *parse Java Classes (j0 don't parse)"); - infomsg(" sN follow robots.txt and meta robots tags (0=never,1=sometimes,* 2=always)"); + infomsg(" sN follow robots.txt and meta robots tags (0=never,1=sometimes,* 2=always, 3=always (even strict rules))"); infomsg(" %h force HTTP/1.0 requests (reduce update features, only for old servers or proxies)"); infomsg(" %k use keep-alive if possible, greately reducing latency for small files and test requests (%k0 don't use)"); infomsg(" %B tolerant requests (accept bogus responses on some servers, but not standard!)"); @@ -479,10 +479,13 @@ void help(char* app,int more) { infomsg(" %u url hacks: various hacks to limit duplicate URLs (strip //, www.foo.com==foo.com..)"); infomsg(" %A assume that a type (cgi,asp..) is always linked with a mime type (-%A php3,cgi=text/html;dat,bin=application/x-zip)"); infomsg(" shortcut: '--assume standard' is equivalent to -%A "HTS_ASSUME_STANDARD); + infomsg(" can also be used to force a specific file type: --assume foo.cgi=text/html"); infomsg(" @iN internet protocol (0=both ipv6+ipv4, 4=ipv4 only, 6=ipv6 only)"); infomsg(""); infomsg("Browser ID:"); - infomsg(" F user-agent field (-F \"user-agent name\")"); + infomsg(" F user-agent field sent in HTTP headers (-F \"user-agent name\")"); + infomsg(" %R default referer field sent in HTTP headers"); + infomsg(" %E from email address sent in HTTP headers"); infomsg(" %F footer string in Html code (-%F \"Mirrored [from host %s [file %s [at %s]]]\""); infomsg(" %l preffered language (-%l \"fr, en, jp, *\""); infomsg(""); @@ -490,7 +493,7 @@ void help(char* app,int more) { infomsg(" C create/use a cache for updates and retries (C0 no cache,C1 cache is prioritary,* C2 test update before)"); infomsg(" k store all files in cache (not useful if files on disk)"); infomsg(" %n do not re-download locally erased files"); - infomsg(" %v display on screen filenames downloaded (in realtime) - * %v1 short version"); + infomsg(" %v display on screen filenames downloaded (in realtime) - * %v1 short version - %v2 full animation"); infomsg(" Q no log - quiet mode"); infomsg(" q no questions - quiet mode"); infomsg(" z log - extra infos"); @@ -523,6 +526,9 @@ void help(char* app,int more) { infomsg(" #X *use optimized engine (limited memory boundary checks)"); infomsg(" #0 filter test (-#0 '*.gif' 'www.bar.com/foo.gif')"); infomsg(" #C cache list (-#C '*.com/spider*.gif'"); + infomsg(" #R cache repair (damaged cache)"); + infomsg(" #d debug parser"); + infomsg(" #E extract new.zip cache meta-data in meta.zip"); infomsg(" #f always flush log files"); infomsg(" #FN maximum number of filters"); infomsg(" #h version info"); @@ -536,10 +542,15 @@ void help(char* app,int more) { infomsg(" #Z generate transfer rate statictics every minutes"); infomsg(" #! execute a shell command (-#! \"echo hello\")"); infomsg(""); + infomsg("Dangerous options: (do NOT use unless you exactly know what you are doing)"); + infomsg(" %! bypass built-in security limits aimed to avoid bandwith abuses (bandwidth, simultaneous connections)"); + infomsg(" IMPORTANT NOTE: DANGEROUS OPTION, ONLY SUITABLE FOR EXPERTS"); + infomsg(" USE IT WITH EXTREME CARE"); + infomsg(""); infomsg("Command-line specific options:"); infomsg(" V execute system command after each files ($0 is the filename: -V \"rm \\$0\")"); infomsg(" %U run the engine with another id when called as root (-%U smith)"); - infomsg(" %W use an external library function as a wrapper (-%W link-detected=foo.so:myfunction)"); + infomsg(" %W use an external library function as a wrapper (-%W link-detected=foo.so:myfunction[,myparameters])"); /* infomsg(" %O do a chroot before setuid"); */ infomsg(""); infomsg("Details: Option N"); @@ -571,6 +582,7 @@ void help(char* app,int more) { infomsg(" '%h' Host name (ex: www.someweb.com)"); infomsg(" '%M' URL MD5 (128 bits, 32 ascii bytes)"); infomsg(" '%Q' query string MD5 (128 bits, 32 ascii bytes)"); + infomsg(" '%r' protocol name (ex: http)"); infomsg(" '%q' small query string MD5 (16 bits, 4 ascii bytes)"); infomsg(" '%s?' Short name version (ex: %sN)"); infomsg(" '%[param]' param variable in query string"); @@ -613,6 +625,8 @@ void help(char* app,int more) { infomsg("'start' : int (* myfunction)(httrackp* opt);"); infomsg("'end' : int (* myfunction)(void);"); infomsg("'change-options' : int (* myfunction)(httrackp* opt);"); + infomsg("'preprocess-html' : int (* myfunction)(char** html,int* len,char* url_adresse,char* url_fichier);"); + infomsg("'postprocess-html' : int (* myfunction)(char** html,int* len,char* url_adresse,char* url_fichier);"); infomsg("'check-html' : int (* myfunction)(char* html,int len,char* url_adresse,char* url_fichier);"); infomsg("'query' : char* (* myfunction)(char* question);"); infomsg("'query2' : char* (* myfunction)(char* question);"); @@ -622,8 +636,10 @@ void help(char* app,int more) { infomsg("'pause' : void (* myfunction)(char* lockfile);"); infomsg("'save-file' : void (* myfunction)(char* file);"); infomsg("'link-detected' : int (* myfunction)(char* link);"); + infomsg("'link-detected2' : int (* myfunction)(char* link, char* start_tag);"); infomsg("'transfer-status' : int (* myfunction)(lien_back* back);"); infomsg("'save-name' : int (* myfunction)(char* adr_complete,char* fil_complete,char* referer_adr,char* referer_fil,char* save);"); + infomsg("And _init() functions if defined, called upon plug"); infomsg(""); infomsg(""); infomsg("example: httrack www.someweb.com/bob/"); diff --git a/src/htshelp.h b/src/htshelp.h index 924a526..67354c7 100644 --- a/src/htshelp.h +++ b/src/htshelp.h @@ -43,11 +43,14 @@ Please visit our Website: http://www.httrack.com #include "htsglobal.h" #include "htscore.h" +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE void infomsg(char* msg); void help(char* app,int more); void make_empty_index(char* str); void help_wizard(httrackp* opt); int help_query(char* list,int def); void help_catchurl(char* dest_path); +#endif #endif diff --git a/src/htsindex.c b/src/htsindex.c index 1a75103..af87396 100644 --- a/src/htsindex.c +++ b/src/htsindex.c @@ -35,9 +35,10 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + -#include -#include #include "htsindex.h" #include "htsglobal.h" #include "htslib.h" @@ -124,12 +125,14 @@ int hts_primindex_words=0; */ void index_init(const char* indexpath) { #if HTS_MAKE_KEYWORD_INDEX +#ifndef _WIN32_WCE /* remove(concat(indexpath,"index.txt")); */ hts_index_init=1; hts_primindex_size=0; hts_primindex_words=0; fp_tmpproject=tmpfile(); #endif +#endif } @@ -298,7 +301,7 @@ int index_keyword(const char* html_data,LLint size,const char* mime,const char* unsigned long int e=0; if (inthash_read(WordIndexHash,line,&e)) { //if (e) { - char savelst[HTS_URLMAXSIZE*2]; + char BIGSTK savelst[HTS_URLMAXSIZE*2]; e++; /* 0 means "once" */ if (strncmp((const char*)fslash((char*)indexpath),filename,strlen(indexpath))==0) // couper diff --git a/src/htsindex.h b/src/htsindex.h index 40a189b..b773034 100644 --- a/src/htsindex.h +++ b/src/htsindex.h @@ -41,8 +41,11 @@ Please visit our Website: http://www.httrack.com #include "htsglobal.h" +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE int index_keyword(const char* html_data,LLint size,const char* mime,const char* filename,const char* indexpath); void index_init(const char* indexpath); void index_finish(const char* indexpath,int mode); +#endif #endif diff --git a/src/htsinthash.c b/src/htsinthash.c index 95b8711..eb155cb 100644 --- a/src/htsinthash.c +++ b/src/htsinthash.c @@ -35,15 +35,15 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + #include "htsinthash.h" /* specific definitions */ #include "htsbase.h" #include "htsglobal.h" #include "htsmd5.h" -#include -#include -#include /* END specific definitions */ /* Specific macros */ @@ -68,11 +68,12 @@ int inthash_write(inthash hashtable,char* name,long int value) { if (strcmp(h->name,name)==0) { /* Delete element */ if (hashtable->flag_valueismalloc) { - if (h->value.intg) { + void* ptr = (void*)h->value.intg; + if (ptr != NULL) { if (hashtable->free_handler) - hashtable->free_handler((void*)h->value.intg); + hashtable->free_handler(ptr); else - freet((void*)h->value.intg); + freet(ptr); } } /* Insert */ @@ -151,7 +152,8 @@ int inthash_read(inthash hashtable,char* name,long int* value) { inthash_chain* h=hashtable->hash[pos]; while (h) { if (strcmp(h->name,name)==0) { - *value=h->value.intg; + if (value != NULL) + *value=h->value.intg; return 1; } h=h->next; @@ -180,12 +182,13 @@ void inthash_delchain(inthash_chain* hash,t_inthash_freehandler free_handler) { inthash_delchain(hash->next,free_handler); if (free_handler) { // pos is a malloc() block, delete it! if (hash->value.intg) { + void* ptr = (void*)hash->value.intg; if (free_handler) - free_handler((void*)hash->value.intg); + free_handler(ptr); else - freet((void*)hash->value.intg); + freet(ptr); + hash->value.intg=0; } - hash->value.intg=0; } freet(hash); } diff --git a/src/htsinthash.h b/src/htsinthash.h index c667cd4..5d7b992 100644 --- a/src/htsinthash.h +++ b/src/htsinthash.h @@ -54,7 +54,7 @@ typedef struct inthash_chain { // structure behind inthash typedef void (* t_inthash_freehandler)(void* value); -typedef struct { +typedef struct struct_inthash { inthash_chain** hash; t_inthash_freehandler free_handler; unsigned int hash_size; @@ -64,6 +64,8 @@ typedef struct { // main inthash type typedef struct_inthash* inthash; +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE // subfunctions unsigned long int inthash_key(char* value); void inthash_init(inthash hashtable); @@ -72,7 +74,6 @@ void inthash_default_free_handler(void* value); // main functions: - /* Hash functions: */ inthash inthash_new(int size); /* Create a new hash table */ int inthash_created(inthash hashtable); /* Test if the hash table was successfully created */ @@ -89,6 +90,6 @@ void* inthash_addblk(inthash hashtable,char* name,int blksize); /* Add entr int inthash_write(inthash hashtable,char* name,long int value); /* Overwrite/add entry in the hash table */ int inthash_inc(inthash hashtable,char* name); /* Increment entry in the hash table */ /* End of hash functions: */ - +#endif #endif diff --git a/src/htsjava.c b/src/htsjava.c index afb166b..3536b9b 100644 --- a/src/htsjava.c +++ b/src/htsjava.c @@ -35,6 +35,9 @@ Please visit our Website: http://www.httrack.com /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + /* Version: Oct/2000 */ /* Fixed: problems with class structure (10/2000) */ @@ -46,10 +49,6 @@ Please visit our Website: http://www.httrack.com #include "htsjava.h" -#include -#include -#include - #include "htsnostatic.h" //#include @@ -186,7 +185,7 @@ int hts_parse_java(htsmoduleStruct* str) if((tab[i].index1!=SClass) && (tab[i].index1!=Class) && (tab[tab[i].index1].name[0]!='[')) { if(!strstr(tab[tab[i].index1].name,"java/")) { - char tempo[1024]; + char BIGSTK tempo[1024]; tempo[0]='\0'; sprintf(tempo,"%s.class",tab[tab[i].index1].name); @@ -289,7 +288,7 @@ RESP_STRUCT readtable(htsmoduleStruct* str, strcpybuff(trans.name,"HTS_UNICODE"); { - char buffer[1024]; + char BIGSTK buffer[1024]; char *p; p=&buffer[0]; diff --git a/src/htsjava.h b/src/htsjava.h index b3d17d4..915824b 100644 --- a/src/htsjava.h +++ b/src/htsjava.h @@ -57,6 +57,8 @@ typedef struct { } RESP_STRUCT; +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE int hts_detect_java(htsmoduleStruct* str); int hts_parse_java(htsmoduleStruct* str); RESP_STRUCT affecte(int i1,int i2,RESP_STRUCT *i3,RESP_STRUCT *i4,int i5); @@ -65,6 +67,6 @@ RESP_STRUCT readtable(htsmoduleStruct* str,FILE *fp,RESP_STRUCT,int*); unsigned short int readshort(FILE *fp); int tris(char*); char * printname(char [1024]); - +#endif #endif diff --git a/src/htslib.c b/src/htslib.c index 3954f9c..9c389c8 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -34,11 +34,20 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + // Fichier librairie .c #include "htslib.h" #include "htsbauth.h" +#ifdef _WIN32_WCE +#ifndef HTS_CECOMPAT +#pragma comment(lib, "celib.lib") //link with celib +#endif +#endif + /* specific definitions */ #include "htsbase.h" #include "htsnet.h" @@ -46,9 +55,11 @@ Please visit our Website: http://www.httrack.com #include "htsthread.h" #include "htsnostatic.h" #include "htswrap.h" -#include +#include "htsmd5.h" #if HTS_WIN +#ifndef _WIN32_WCE #include +#endif #else #ifdef HAVE_SYS_TYPES_H #include @@ -60,32 +71,39 @@ Please visit our Website: http://www.httrack.com #include #endif #endif -#include #include #include +#ifndef _WIN32_WCE +#include +#else +#ifndef HTS_CECOMPAT #include +#endif +#endif +#ifndef _WIN32_WCE #include +#endif // pour utimbuf #if HTS_WIN +#ifndef _WIN32_WCE +#include +#else +#ifndef HTS_CECOMPAT #include +#endif +#endif #else #include #endif +#ifndef _WIN32_WCE +#include +#endif /* END specific definitions */ - -// Débuggage de contrôle -#if HTS_DEBUG_CLOSESOCK -#define _HTS_WIDE 1 -#endif -#if HTS_WIDE_DEBUG -#define _HTS_WIDE 1 -#endif +// Debugging #if _HTS_WIDE FILE* DEBUG_fp=NULL; -#define DEBUG_W(A) { if (DEBUG_fp==NULL) DEBUG_fp=fopen("bug.out","wb"); fprintf(DEBUG_fp,":>"A); fflush(DEBUG_fp); } -#define DEBUG_W2(A) { if (DEBUG_fp==NULL) DEBUG_fp=fopen("bug.out","wb"); fprintf(DEBUG_fp,A); fflush(DEBUG_fp); } #endif /* variables globales */ @@ -553,6 +571,7 @@ const char* hts_mime[][2] = { || CIS(c,'*') \ || CIS(c,'\'') \ || CIS(c,'\"') \ + || CIS(c,'&') \ || CIS(c,'!') ) //#define CHAR_XXAVOID(c) ( strchr(" *'\"!",(unsigned char)(c)) != 0 ) #define CHAR_MARK(c) ( CIS(c,'-') \ @@ -581,7 +600,9 @@ char* antislash(char* s) { } #endif - +#ifdef _WIN32_WCE +char cwd[MAX_PATH+1] = ""; +#endif // Récupération d'un fichier http sur le net. // Renvoie une adresse sur le bloc de mémoire, ou bien @@ -592,8 +613,8 @@ char* antislash(char* s) { // en background htsblk httpget(char* url) { - char adr[HTS_URLMAXSIZE*2]; // adresse - char fil[HTS_URLMAXSIZE*2]; // chemin + char BIGSTK adr[HTS_URLMAXSIZE*2]; // adresse + char BIGSTK fil[HTS_URLMAXSIZE*2]; // chemin // séparer URL en adresse+chemin if (ident_url_absolute(url,adr,fil)==-1) { @@ -692,7 +713,7 @@ int http_xfopen(int mode,int treat,int waitconnect,char* xsend,char* adr,char* f // Test en cas de file:///C|... if (!fexist(fconv(unescape_http(fil)))) if (fexist(fconv(unescape_http(fil+1)))) { - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; strcpybuff(tempo,fil+1); strcpybuff(fil,tempo); } @@ -802,7 +823,7 @@ int http_xfopen(int mode,int treat,int waitconnect,char* xsend,char* adr,char* f // envoi d'une requète int http_sendhead(t_cookie* cookie,int mode,char* xsend,char* adr,char* fil,char* referer_adr,char* referer_fil,htsblk* retour) { - char buff[8192]; + char BIGSTK buff[8192]; //int use_11=0; // HTTP 1.1 utilisé int direct_url=0; // ne pas analyser l'url (exemple: ftp://) char* search_tag=NULL; @@ -826,8 +847,8 @@ int http_sendhead(t_cookie* cookie,int mode,char* xsend,char* adr,char* fil,char if (mode==0) { // GET! FILE* fp=fopen(unescape_http(search_tag+strlen(POSTTOK)+5),"rb"); if (fp) { - char line[1100]; - char protocol[256],url[HTS_URLMAXSIZE*2],method[256]; + char BIGSTK line[1100]; + char BIGSTK protocol[256],url[HTS_URLMAXSIZE*2],method[256]; linput(fp,line,1000); if (sscanf(line,"%s %s %s",method,url,protocol) == 3) { // selon que l'on a ou pas un proxy @@ -879,7 +900,7 @@ int http_sendhead(t_cookie* cookie,int mode,char* xsend,char* adr,char* fil,char // on slash doit être présent en début, sinon attention aux bad request! (400) if (*fil!='/') strcatbuff(buff,"/"); { - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0'; if (search_tag) strncatbuff(tempo,fil,(int) (search_tag - fil)); @@ -923,25 +944,31 @@ int http_sendhead(t_cookie* cookie,int mode,char* xsend,char* adr,char* fil,char } // Referer? - if ((referer_adr) && (referer_fil)) { // existe - if ((strnotempty(referer_adr)) && (strnotempty(referer_fil))) { // non vide - if ( - (strcmp(referer_adr,"file://") != 0) - && - ( /* no https referer to http urls */ - (strncmp(referer_adr, "https://", 8) != 0) /* referer is not https */ - || - (strncmp(adr, "https://", 8) == 0) /* or referer AND addresses are https */ - ) - ) { // PAS file:// - strcatbuff(buff,"Referer: "); - strcatbuff(buff,"http://"); - strcatbuff(buff,jump_identification(referer_adr)); - strcatbuff(buff,referer_fil); - strcatbuff(buff,H_CRLF); - } + if (referer_adr != NULL && referer_fil != NULL + && strnotempty(referer_adr) && strnotempty(referer_fil) + ) { // non vide + if ( + (strcmp(referer_adr,"file://") != 0) + && + ( /* no https referer to http urls */ + (strncmp(referer_adr, "https://", 8) != 0) /* referer is not https */ + || + (strncmp(adr, "https://", 8) == 0) /* or referer AND addresses are https */ + ) + ) { // PAS file:// + strcatbuff(buff,"Referer: "); + strcatbuff(buff,"http://"); + strcatbuff(buff,jump_identification(referer_adr)); + strcatbuff(buff,referer_fil); + strcatbuff(buff,H_CRLF); } } + // HTTP field: referer + else if (retour->req.referer[0] != '\0') { + strcatbuff(buff,"Referer: "); + strcatbuff(buff, retour->req.referer); + strcatbuff(buff, H_CRLF); + } // POST? if (mode==0) { // GET! @@ -1002,6 +1029,13 @@ int http_sendhead(t_cookie* cookie,int mode,char* xsend,char* adr,char* fil,char } //} + // HTTP field: from + if (retour->req.from[0] != '\0') { // HTTP from + strcatbuff(buff,"From: "); + strcatbuff(buff, retour->req.from); + strcatbuff(buff, H_CRLF); + } + // Présence d'un user-agent? if (retour->req.user_agent_send) { // ohh un user-agent char s[256]; @@ -1113,12 +1147,13 @@ int http_sendhead(t_cookie* cookie,int mode,char* xsend,char* adr,char* fil,char #endif // Envoi + HTS_STAT.last_request = mtime_local(); if (sendc(retour, buff)<0) { // ERREUR, socket rompue?... //if (sendc(retour->soc,buff) != strlen(buff)) { // ERREUR, socket rompue?... deletesoc_r(retour); // fermer tout de même // et tenter de reconnecter - strcpybuff(retour->msg,"Write error"); + strcpybuff(retour->msg, "Write error"); retour->soc=INVALID_SOCKET; } @@ -1411,7 +1446,7 @@ void treathead(t_cookie* cookie,char* adr,char* fil,htsblk* retour,char* rcvd) { char domain[256]; // domaine cookie (.netscape.com) char path[256]; // chemin (/) char cook_name[256]; // nom cookie (MYCOOK) - char cook_value[8192]; // valeur (ID=toto,S=1234) + char BIGSTK cook_value[8192]; // valeur (ID=toto,S=1234) #if DEBUG_COOK printf("set-cookie detected\n"); #endif @@ -1419,7 +1454,7 @@ void treathead(t_cookie* cookie,char* adr,char* fil,htsblk* retour,char* rcvd) { char *token_st,*token_end; char *value_st,*value_end; char name[256]; - char value[8192]; + char BIGSTK value[8192]; int next=0; name[0]=value[0]='\0'; // @@ -1679,6 +1714,11 @@ HTS_INLINE LLint http_fread1(htsblk* r) { LLint http_xfread1(htsblk* r,int bufl) { int nl=-1; + // EOF + if (r->totalsize > 0 && r->size == r->totalsize) { + return READ_EOF; + } + if (bufl>0) { if (!r->is_write) { // stocker en mémoire if (r->totalsize>0) { // totalsize déterminé ET ALLOUE @@ -1691,9 +1731,11 @@ LLint http_xfread1(htsblk* r,int bufl) { nl = hts_read(r,r->adr + ((int) r->size),(int) (r->totalsize-r->size) ); /* NO 32 bit overlow possible here (no 4GB html!) */ // nouvelle taille if (nl >= 0) r->size+=nl; - - if ((nl < 0) || (r->size >= r->totalsize)) - nl=-1; // break + + /* + if (r->size >= r->totalsize) + nl = -1; // break + */ r->adr[r->size]='\0'; // caractère NULL en fin au cas où l'on traite des HTML } @@ -1717,7 +1759,7 @@ LLint http_xfread1(htsblk* r,int bufl) { if (r->adr!=NULL) { // lecture nl = hts_read(r,r->adr+(int)r->size,bufl); - if (nl>0) { + if (nl > 0) { // resize r->adr=(char*) realloct(r->adr,(int)r->size+nl + 1); // nouvelle taille @@ -1737,7 +1779,7 @@ LLint http_xfread1(htsblk* r,int bufl) { } // pas de adr=erreur - if (r->adr==NULL) nl=-1; + if (r->adr == NULL) nl = READ_ERROR; } else { // stocker sur disque char* buff; @@ -1751,17 +1793,17 @@ LLint http_xfread1(htsblk* r,int bufl) { if ((INTsys)fwrite(buff,1,nl,r->out)!=nl) { r->statuscode=-1; strcpybuff(r->msg,"Write error on disk"); - nl=-1; + nl=READ_ERROR; } } - if ((nl < 0) || ((r->totalsize>0) && (r->size >= r->totalsize))) - nl=-1; // break + //if ((nl < 0) || ((r->totalsize>0) && (r->size >= r->totalsize))) + // nl=-1; // break // libérer bloc tempo freet(buff); } else - nl=-1; + nl=READ_ERROR; if ((nl < 0) && (r->out!=NULL)) { fflush(r->out); @@ -1783,7 +1825,7 @@ LLint http_xfread1(htsblk* r,int bufl) { int lf_detected=0; int at_begining=1; do { - nl=-1; + nl = READ_INTERNAL_ERROR; count--; if (r->adr==NULL) { r->adr=(char*) malloct(8192); @@ -1793,7 +1835,7 @@ LLint http_xfread1(htsblk* r,int bufl) { if (r->size < 8190) { // lecture nl = hts_read(r,r->adr+r->size,1); - if (nl>0) { + if (nl > 0) { // exit if: // lf detected AND already detected before // or @@ -1825,18 +1867,16 @@ LLint http_xfread1(htsblk* r,int bufl) { count=-1; } } while((nl >= 0) && (count>0)); - nl = tot_nl; + if (nl >= 0) { + nl = tot_nl; + } } -#if HDEBUG - //printf("add to %d / %d\n",r->size,r->totalsize); -#endif - // nl == 0 may mean "no relevant data", for example is using cache or ssl -#if HTS_USEOPENSSL - if (r->ssl) + // EOF + if (r->totalsize > 0 && r->size == r->totalsize) { + return READ_EOF; + } else { return nl; - else -#endif - return ((nl > 0) ? nl : -1); // ==0 is fatal if direct read + } } @@ -1977,20 +2017,20 @@ int newhttp(char* _iadr,htsblk* retour,int port,int waitconnect) { char* iadr; // unsigned short int port; - // tester un éventuel id:pass et virer id:pass@ si détecté - iadr = jump_identification(_iadr); - // si iadr="#" alors c'est une fausse URL, mais un vrai fichier // local. // utile pour les tests! //## if (iadr[0]!=lOCAL_CHAR) { - if (strcmp(_iadr,"file://")) { /* non fichier */ + if (strcmp(_iadr,"file://") != 0) { /* non fichier */ SOCaddr server; int server_size=sizeof(server); t_hostent* hp; // effacer structure memset(&server, 0, sizeof(server)); + // tester un éventuel id:pass et virer id:pass@ si détecté + iadr = jump_identification(_iadr); + #if HDEBUG printf("gethostbyname\n"); #endif @@ -2007,7 +2047,7 @@ int newhttp(char* _iadr,htsblk* retour,int port,int waitconnect) { port=80; // port par défaut #endif if (a) { - char iadr2[HTS_URLMAXSIZE*2]; + char BIGSTK iadr2[HTS_URLMAXSIZE*2]; int i=-1; iadr2[0]='\0'; sscanf(a+1,"%d",&i); @@ -2046,6 +2086,9 @@ int newhttp(char* _iadr,htsblk* retour,int port,int waitconnect) { } // copie adresse SOCaddr_copyaddr(server, server_size, hp->h_addr_list[0], hp->h_length); + // make a copy for external clients + retour->address_size = sizeof(retour->address); + SOCaddr_copyaddr(retour->address, retour->address_size, hp->h_addr_list[0], hp->h_length); // memcpy(&SOCaddr_sinaddr(server), hp->h_addr_list[0], hp->h_length); // créer ("attachement") une socket (point d'accès) internet,en flot @@ -2060,7 +2103,7 @@ int newhttp(char* _iadr,htsblk* retour,int port,int waitconnect) { retour->debugid = HTS_STAT.stat_sockid++; } #if HTS_WIDE_DEBUG - DEBUG_W("socket done\n"); + DEBUG_W("socket()=%d\n" _ (int) soc); #endif if (soc==INVALID_SOCKET) { if (retour) @@ -2103,6 +2146,7 @@ int newhttp(char* _iadr,htsblk* retour,int port,int waitconnect) { #if HDEBUG printf("connect\n"); #endif + HTS_STAT.last_connect = mtime_local(); #if HTS_WIDE_DEBUG DEBUG_W("connect\n"); @@ -2113,10 +2157,6 @@ int newhttp(char* _iadr,htsblk* retour,int port,int waitconnect) { if (connect(soc, (struct sockaddr *)&server, server_size) == -1) { #endif - // no - non blocking - //deletesoc(soc); - //soc=INVALID_SOCKET; - // bloquant if (waitconnect) { #if HDEBUG @@ -2180,7 +2220,7 @@ int ident_url_absolute(char* url,char* adr,char* fil) { // 1. optional scheme ":" if ((pos=strfield(url,"file:"))) { // fichier local!! (pour les tests) - //!! p+=3; + //!!p+=3; strcpybuff(adr,"file://"); } else if ((pos=strfield(url,"http:"))) { // HTTP //!!p+=3; @@ -2233,15 +2273,19 @@ int ident_url_absolute(char* url,char* adr,char* fil) { char *p; int i; char* a; - + p=url+pos; if (*p == '/' || *p == '\\') { /* file:///.. */ strcatbuff(fil,p); // fichier local ; adr="#" } else { - strcatbuff(fil,"//"); /* file://server/foo */ - strcatbuff(fil,p); + if (p[1] != ':') { + strcatbuff(fil,"//"); /* file://server/foo */ + strcatbuff(fil,p); + } else { + strcatbuff(fil,p); // file://C:\.. + } } - + a=strchr(fil,'?'); if (a) *a='\0'; /* couper query (inutile pour file:// lors de la requête) */ @@ -2272,66 +2316,52 @@ int ident_url_absolute(char* url,char* adr,char* fil) { return 0; } -// simplification des ../ +/* simplify ../ and ./ */ void fil_simplifie(char* f) { - int i=0; - int last=0; - char* a; - - // éliminer ../ - while (f[i]) { - - if (f[i]=='/') { - if (f[i+1]=='.') - if (f[i+2]=='.') // couper dernier répertoire - if (f[i+3]=='/') // éviter les /tmp/..coolandlamedir/ - { // couper dernier répertoire - char tempo[HTS_URLMAXSIZE*2]; - tempo[0]='\0'; - // - if (!last) /* can't go upper.. */ - strcpybuff(tempo,"/"); - else - strncpy(tempo,f,last+1); - tempo[last+1]='\0'; - strcatbuff(tempo,f+i+4); - strcpybuff(f,tempo); // remplacer - i=-1; // recommencer - last=0; + char *a, *b; + char *rollback[128]; + int rollid = 0; + char lc = '/'; + int query = 0; + for(a = b = f ; *a != '\0' ; ) { + if (*a == '?') + query = 1; + if (query == 0 && lc == '/' && a[0] == '.' && a[1] == '/') { /* foo/./bar or ./foo */ + a += 2; + } + else if (query == 0 && lc == '/' && a[0] == '.' && a[1] == '.' && a[2] == '/') { /* foo/../bar or ../foo */ + a += 3; + if (rollid > 1) { + rollid--; + b = rollback[rollid - 1]; + } else { + rollid = 0; + b = f; } - - if (i>=0) - last=i; - else - last=0; + } else { + *b++ = lc = *a; + if (*a == '/') { + rollback[rollid++] = b; + if (rollid >= 127) { + *f = '\0'; /* ERROR */ + break; + } + } + a++; } - - i++; } - - // éliminer ./ - while ( (a=strstr(f,"./")) ) { - char tempo[HTS_URLMAXSIZE*2]; - tempo[0]='\0'; - strcpybuff(tempo,a+2); - strcpybuff(a,tempo); - } - // delete all remaining ../ (potential threat) - while ( (a=strstr(f,"../")) ) { - char tempo[HTS_URLMAXSIZE*2]; - tempo[0]='\0'; - strcpybuff(tempo,a+3); - strcpybuff(a,tempo); + *b = '\0'; + if (*f == '\0') { + f[0] = '.'; + f[1] = '/'; + f[2] = '\0'; } - } // fermer liaison fichier ou socket HTS_INLINE void deletehttp(htsblk* r) { #if HTS_DEBUG_CLOSESOCK - char info[256]; - sprintf(info,"deletehttp: (htsblk*) %d\n",r); - DEBUG_W2(info); + DEBUG_W("deletehttp: (htsblk*) 0x%p\n" _ (void*) r); #endif #if HTS_USEOPENSSL /* Free OpenSSL structures */ @@ -2357,27 +2387,22 @@ HTS_INLINE void deletehttp(htsblk* r) { // free the addr buffer // always returns 1 HTS_INLINE int deleteaddr(htsblk* r) { - if (r->adr) { + if (r->adr != NULL) { freet(r->adr); r->adr = NULL; } + if (r->headers != NULL) { + freet(r->headers); + r->headers = NULL; + } return 1; } // fermer une socket HTS_INLINE void deletesoc(T_SOC soc) { - if (soc!=INVALID_SOCKET) { -// J'ai planté.. pas de shutdown -//#if HTS_WIDE_DEBUG -// DEBUG_W("shutdown\n"); -//#endif -// shutdown(soc,2); // shutdown -//#if HTS_WIDE_DEBUG -// DEBUG_W("shutdown done\n"); -//#endif - // Ne pas oublier de fermer la connexion avant de partir.. (plus propre) + if (soc!=INVALID_SOCKET && soc!=LOCAL_SOCKET_ID) { #if HTS_WIDE_DEBUG - DEBUG_W("close\n"); + DEBUG_W("close %d\n" _ (int) soc); #endif #if HTS_WIN closesocket(soc); @@ -2385,7 +2410,7 @@ HTS_INLINE void deletesoc(T_SOC soc) { close(soc); #endif #if HTS_WIDE_DEBUG - DEBUG_W("close done\n"); + DEBUG_W(".. done\n"); #endif } } @@ -2400,8 +2425,10 @@ HTS_INLINE void deletesoc_r(htsblk* r) { r->ssl_con=NULL; } #endif - deletesoc(r->soc); - r->soc=INVALID_SOCKET; + if (r->soc!=INVALID_SOCKET) { + deletesoc(r->soc); + r->soc=INVALID_SOCKET; + } } // renvoi le nombre de secondes depuis 1970 @@ -2597,6 +2624,20 @@ int set_filetime_rfc822(char* file,char* date) { } else return -1; } +int get_filetime_rfc822(char* file,char* date) { + struct stat buf; + date[0] = '\0'; + if (stat(file, &buf) == 0) { + struct tm* A; + time_t tt = buf.st_mtime; + A=gmtime(&tt); + if (A==NULL) + A=localtime(&tt); + time_rfc822(date, A); + return 1; + } + return 0; +} // heure au format rfc (taille buffer 256o) HTS_INLINE void time_rfc822(char* s,struct tm * A) { @@ -2758,33 +2799,21 @@ int finput(int fd,char* s,int max) { } // Like linput, but in memory (optimized) -int binput(char* buff,char* s,int max) { - char* end; - int count; - - // clear buffer - s[0]='\0'; - // end of buffer? - if ( *buff == '\0') - return 1; - // find ending \n - end=strchr(buff,'\n'); - // ..or end of buffer - if (!end) - end=buff+strlen(buff); - // then count number of bytes, maximum=max - count=min(max,end-buff); - // and strip annoying ending cr - while( (count>0) && (buff[count] == '\r')) - count--; - // copy - if (count > 0) { - strncatbuff(s, buff, count); +int binput(char* buff, char* s, int max) { + int count = 0; + int destCount = 0; + + // Note: \0 will return 1 + while(count < max && buff != NULL && buff[count] != '\0' && buff[count] != '\n') { + if (buff[count] != '\r') { + s[destCount++] = buff[count]; + } + count++; } - // and terminate with a null char - s[count]='\0'; + s[destCount] = '\0'; + // then return the supplemental jump offset - return (end-buff)+1; + return count + 1; } // Lecture d'une ligne (peut être unicode à priori) @@ -2894,18 +2923,6 @@ void rawlinput(FILE* fp,char* s,int max) { s[j++]='\0'; } - -// compare le début de f avec s et retourne la position de la fin -// 'A=a' (case insensitive) -int strfield(const char* f,const char* s) { - int r=0; - while (streql(*f,*s) && ((*f)!=0) && ((*s)!=0)) { f++; s++; r++; } - if (*s==0) - return r; - else - return 0; -} - //cherche chaine, case insensitive char* strstrcase(char *s,char *o) { while((*s) && (strfield(s,o)==0)) s++; @@ -3006,8 +3023,8 @@ void map_characters(unsigned char* buffer, unsigned int size, unsigned int* map) // 1 : oui // -1 : on sait pas // -2 : on sait pas, pas d'extension -int ishtml(char* fil) { - char *a; +int ishtml(const char* fil) { + const char *a; // patch pour les truc.html?Choix=toto if ( (a=strchr(fil,'?')) ) // paramètres? @@ -3020,19 +3037,20 @@ int ishtml(char* fil) { while ( (*a!='.') && (*a!='/') && ( a > fil)) a--; if (*a=='.') { // a une extension - char fil_noquery[HTS_URLMAXSIZE*2]; + char BIGSTK fil_noquery[HTS_URLMAXSIZE*2]; + char* b; fil_noquery[0]='\0'; a++; // pointer sur extension strncatbuff(fil_noquery,a,HTS_URLMAXSIZE); - a=strchr(fil_noquery,'?'); - if (a) - *a='\0'; + b=strchr(fil_noquery,'?'); + if (b) + *b='\0'; return ishtml_ext(fil_noquery); // retour } else return -2; // indéterminé, par exemple /truc } // idem, mais pour uniquement l'extension -int ishtml_ext(char* a) { +int ishtml_ext(const char* a) { int html=0; // if (strfield2(a,"html")) html = 1; @@ -3075,6 +3093,8 @@ HTS_INLINE int ishttperror(int err) { // une identification HTSEXT_API char* jump_identification(char* source) { char *a,*trytofind; + if (strcmp(source, "file://") == 0) + return source; // rechercher dernier @ (car parfois email transmise dans adresse!) // mais sauter ftp:// éventuel a = jump_protocol(source); @@ -3083,6 +3103,8 @@ HTSEXT_API char* jump_identification(char* source) { } HTSEXT_API char* jump_normalized(char* source) { + if (strcmp(source, "file://") == 0) + return source; source = jump_identification(source); if (strfield(source, "www") && source[3] != '\0') { if (source[3] == '.') { // www.foo.com -> foo.com @@ -3098,25 +3120,80 @@ HTSEXT_API char* jump_normalized(char* source) { return source; } -HTSEXT_API char* fil_normalized(char* source, char* dest_) { - char* dest=dest_; +static int sortNormFnc(const void * a_, const void * b_) { + char** a = (char**) a_; + char** b = (char**) b_; + return strcmp(*a+1, *b+1); +} + + +HTSEXT_API char* fil_normalized(char* source, char* dest) { char lastc = 0; int gotquery=0; - while(*source) { - if (*source == '?') - gotquery=1; + int ampargs=0; + int i,j; + char* query=NULL; + for(i=j=0 ; source[i] != '\0'; i++) { + if (!gotquery && source[i] == '?') + gotquery=ampargs=1; if ( - (!gotquery && lastc == '/' && *source == '/') // foo//bar -> foo/bar + (!gotquery && lastc == '/' && source[i] == '/') // foo//bar -> foo/bar ) { } else { - *dest++ = *source; + if (gotquery && source[i] == '&') { + ampargs++; + } + dest[j++] = source[i]; + } + lastc = source[i]; + } + dest[j++] = '\0'; + + /* Sort arguments (&foo=1&bar=2 == &bar=2&foo=1) */ + if (ampargs > 1) { + char** amps = malloct(ampargs * sizeof(char*)); + char* copyBuff = NULL; + int qLen=0; + assertf(amps != NULL); + gotquery = 0; + for(i=j=0 ; dest[i] != '\0'; i++) { + if ( (gotquery && dest[i] == '&') || ( !gotquery && dest[i] == '?') ) { + if (!gotquery) { + gotquery=1; + query = &dest[i]; + qLen = (int)strlen(query); + } + assertf(j < ampargs); + amps[j++] = &dest[i]; + dest[i] = '\0'; + } } - lastc = *source; - source++; + assertf(j == ampargs); + + /* Sort 'em all */ + qsort(amps, ampargs, sizeof(char*), sortNormFnc); + + /* Replace query by sorted query */ + copyBuff = malloct(qLen + 1); + assertf(copyBuff != NULL); + copyBuff[0] = '\0'; + for(i = 0 ; i < ampargs ; i++) { + if (i == 0) + strcatbuff(copyBuff, "?"); + else + strcatbuff(copyBuff, "&"); + strcatbuff(copyBuff, amps[i] + 1); + } + assert((int)strlen(copyBuff) <= qLen); + strcpybuff(query, copyBuff); + + /* Cleanup */ + freet(amps); + freet(copyBuff); } - *dest++ = '\0'; - return dest_; + + return dest; } #define endwith(a) ( (len >= (sizeof(a)-1)) ? ( strncmp(dest, a+len-(sizeof(a)-1), sizeof(a)-1) == 0 ) : 0 ); @@ -3154,6 +3231,21 @@ char* strrchr_limit(char* s, char c, char* limit) { } } +// strrchr, but not too far +char* strstr_limit(char* s, char* sub, char* limit) { + if (limit == NULL) { + return strstr(s, sub); + } else { + char* pos = strstr(s, sub); + if (pos != NULL) { + char* farpos = strstr(s, limit); + if (farpos == NULL || pos < farpos) + return pos; + } + } + return NULL; +} + // retourner adr sans ftp:// HTS_INLINE char* jump_protocol(char* source) { int p; @@ -3456,7 +3548,7 @@ HTSEXT_API void unescape_amp(char* s) { c='~'; // remplacer? if (c) { - char buff[HTS_URLMAXSIZE*2]; + char BIGSTK buff[HTS_URLMAXSIZE*2]; buff[0]=(char) c; strcpybuff(buff+1,end+1); strcpybuff(s,buff); @@ -3467,6 +3559,17 @@ HTSEXT_API void unescape_amp(char* s) { } } +static int ehexh(char c) { + if ((c>='0') && (c<='9')) return c-'0'; + if ((c>='a') && (c<='f')) c-=('a'-'A'); + if ((c>='A') && (c<='F')) return (c-'A'+10); + return 0; +} + +static int ehex(char* s) { + return 16*ehexh(*s)+ehexh(*(s+1)); +} + // remplacer %20 par ' ', | par : etc.. // buffer MAX 1Ko HTSEXT_API char* unescape_http(char* s) { @@ -3564,7 +3667,7 @@ HTSEXT_API void escape_remove_control(char* s) { unsigned char* ss = (unsigned char*) s; while(*ss) { if (*ss < 32) { /* CONTROL characters go away! */ - char tmp[HTS_URLMAXSIZE*2]; + char BIGSTK tmp[HTS_URLMAXSIZE*2]; strcpybuff(tmp, ss+1); strcpybuff(ss, tmp); } else { @@ -3573,6 +3676,25 @@ HTSEXT_API void escape_remove_control(char* s) { } } +HTSEXT_API void x_escape_html(char* s) { + while(*s) { + int test=0; + test = ( + CHAR_HIG(*s) + || CHAR_XXAVOID(*s) ); + + if (test) { + char BIGSTK buffer[HTS_URLMAXSIZE*3]; + int n; + n = (int)(unsigned char) *s; + strcpybuff(buffer, s+1); + sprintf(s,"&#x%02x;", n); + strcatbuff(s, buffer); + } + s++; + } +} + HTSEXT_API void x_escape_http(char* s,int mode) { while(*s) { @@ -3588,7 +3710,7 @@ HTSEXT_API void x_escape_http(char* s,int mode) { || CHAR_MARK(*s)); } else if (mode==2) - test=(strchr(" ",*s)!=0); // n'escaper que espace + test=(*s == ' '); // n'escaper que espace else if (mode==3) { // échapper que ce qui est nécessaire test = ( CHAR_SPECIAL(*s) @@ -3601,7 +3723,7 @@ HTSEXT_API void x_escape_http(char* s,int mode) { } if (test) { - char buffer[HTS_URLMAXSIZE*3]; + char BIGSTK buffer[HTS_URLMAXSIZE*3]; int n; n=(int)(unsigned char) *s; strcpybuff(buffer,s+1); @@ -3612,18 +3734,34 @@ HTSEXT_API void x_escape_http(char* s,int mode) { } } +HTSEXT_API void escape_for_html_print(char* s, char* d) { + for( ; *s ; s++) { + if (*s == '&') { + strcpybuff(d, "&"); + d += strlen(d); + } else { + *d++ = *s; + } + } + *d = '\0'; +} -HTS_INLINE int ehexh(char c) { - if ((c>='0') && (c<='9')) return c-'0'; - if ((c>='a') && (c<='f')) c-=('a'-'A'); - if ((c>='A') && (c<='F')) return (c-'A'+10); - return 0; +HTSEXT_API void escape_for_html_print_full(char* s, char* d) { + for( ; *s ; s++) { + if (*s == '&') { + strcpybuff(d, "&"); + d += strlen(d); + } else if (CHAR_HIG(*s)) { + sprintf(d, "&#x%02x;", (unsigned char) *s); + d += strlen(d); + } else { + *d++ = *s; + } + } + *d = '\0'; } -HTS_INLINE int ehex(char* s) { - return 16*ehexh(*s)+ehexh(*(s+1)); -} // concat, concatène deux chaines et renvoi le résultat // permet d'alléger grandement le code @@ -3731,18 +3869,18 @@ HTS_INLINE int is_realspace(char c) { // deviner type d'un fichier local.. // ex: fil="toto.gif" -> s="image/gif" -void guess_httptype(char *s,char *fil) { +void guess_httptype(char *s,const char *fil) { get_httptype(s,fil,1); } // idem // flag: 1 si toujours renvoyer un type -void get_httptype(char *s,char *fil,int flag) { +void get_httptype(char *s,const char *fil,int flag) { if (ishtml(fil)==1) strcpybuff(s,"text/html"); else { - char *a=fil+strlen(fil)-1; + const char *a=fil+strlen(fil)-1; while ( (*a!='.') && (*a!='/') && (a>fil)) a--; - if (*a=='.') { + if (*a=='.' && strlen(a) < 32) { int ok=0; int j=0; a++; @@ -3766,7 +3904,7 @@ void get_httptype(char *s,char *fil,int flag) { // get type of fil (php) // s: buffer (text/html) or NULL // return: 1 if known by user -int get_userhttptype(int setdefs,char *s,char *ext) { +int get_userhttptype(int setdefs,char *s,const char *ext) { char** buffer=NULL; NOSTATIC_RESERVE(buffer, char*, 1); if (setdefs) { @@ -3778,7 +3916,7 @@ int get_userhttptype(int setdefs,char *s,char *ext) { if (!ext) return 0; if (*buffer) { - char search[1024]; + char BIGSTK search[1024]; char* detect; sprintf(search,"\n%s=",ext); // php=text/html detect=strstr(*buffer,search); @@ -3844,7 +3982,7 @@ void give_mimext(char *s,char *st) { // 0 : non // 1 : oui // 2 : html -int is_knowntype(char *fil) { +int is_knowntype(const char *fil) { int j=0; if (!fil) return 0; @@ -3862,19 +4000,20 @@ int is_knowntype(char *fil) { return (is_userknowntype(fil)); } // extension : html,gif.. -char* get_ext(char *fil) { +char* get_ext(const char *fil) { char* fil_noquery; - char *a=fil+strlen(fil)-1; + const char *a=fil+strlen(fil)-1; NOSTATIC_RESERVE(fil_noquery, char, HTS_URLMAXSIZE*2); while ( (*a!='.') && (*a!='/') && (a>fil)) a--; if (*a=='.') { + char* b; fil_noquery[0]='\0'; a++; // pointer sur extension strncatbuff(fil_noquery,a,HTS_URLMAXSIZE); - a=strchr(fil_noquery,'?'); - if (a) - *a='\0'; + b=strchr(fil_noquery,'?'); + if (b) + *b='\0'; return concat(fil_noquery,""); } else @@ -3886,8 +4025,8 @@ char* get_ext(char *fil) { // 2 : html // setdefs : set mime buffer: // file=(char*) "asp=text/html\nphp=text/html\n" -int is_userknowntype(char *fil) { - char mime[1024]; +int is_userknowntype(const char *fil) { + char BIGSTK mime[1024]; if (!fil) return 0; if (!strnotempty(fil)) @@ -3904,7 +4043,7 @@ int is_userknowntype(char *fil) { // page dynamique? // is_dyntype(get_ext("foo.asp")) -int is_dyntype(char *fil) { +int is_dyntype(const char *fil) { int j=0; if (!fil) return 0; @@ -3921,11 +4060,12 @@ int is_dyntype(char *fil) { // types critiques qui ne doivent pas être changés car renvoyés par des serveurs qui ne // connaissent pas le type -int may_unknown(char* st) { +int may_unknown(const char* st) { int j=0; // types média - if (may_be_hypertext_mime(st)) + if (may_be_hypertext_mime(st, "")) { return 1; + } while(strnotempty(hts_mime_keep[j])) { if (strfield2(hts_mime_keep[j],st)) { // trouvé return 1; @@ -3936,7 +4076,6 @@ int may_unknown(char* st) { } - // -- Utils fichiers // pretty print for i/o @@ -4106,23 +4245,24 @@ int HTS_TOTAL_RECV_CHECK(int var) { #endif // Lecture dans buff de size octets au maximum en utilisant la socket r (structure htsblk) +// returns: // >0 : data received // == 0 : not yet data -// <0 : no more data or error +// <0: error or no data: READ_ERROR, READ_EOF or READ_TIMEOUT HTS_INLINE int hts_read(htsblk* r,char* buff,int size) { int retour; // return read(soc,buff,size); if (r->is_file) { #if HTS_WIDE_DEBUG - DEBUG_W("read\n"); + DEBUG_W("read(%p, %d, %d)\n" _ (void*) buff _ (int) size _ (int) r->fp); #endif if (r->fp) - retour=(int)fread(buff,1,size,r->fp); + retour = (int)fread(buff,1,size,r->fp); else - retour=-1; + retour = READ_ERROR; } else { #if HTS_WIDE_DEBUG - DEBUG_W("recv\n"); + DEBUG_W("recv(%d, %p, %d)\n" _ (int) r->soc _ (void*) buff _ (int) size); if (r->soc==INVALID_SOCKET) printf("!!WIDE_DEBUG ERROR, soc==INVALID hts_read\n"); #endif @@ -4139,13 +4279,20 @@ HTS_INLINE int hts_read(htsblk* r,char* buff,int size) { ) { retour = 0; /* no data yet (ssl cache) */ + } else if (err_code == SSL_ERROR_ZERO_RETURN) { + retour = READ_EOF; /* completed */ } else { - retour = -1; /* eof or error */ + retour = READ_ERROR; /* eof or error */ } } } else { #endif retour=recv(r->soc,buff,size,0); + if (retour == 0) { + retour = READ_EOF; + } else if (retour < 0) { + retour = READ_ERROR; + } } if (retour > 0) // compter flux entrant HTS_STAT.HTS_TOTAL_RECV+=retour; @@ -4153,7 +4300,7 @@ HTS_INLINE int hts_read(htsblk* r,char* buff,int size) { } #endif #if HTS_WIDE_DEBUG - DEBUG_W("recv/read done\n"); + DEBUG_W("recv/read done (%d bytes)\n" _ (int) retour); #endif return retour; } @@ -4179,7 +4326,7 @@ static void hts_cache_free_(t_dnscache* cache) { } } void hts_cache_free(t_dnscache* cache) { - if (cache != NULL) { + if (cache != NULL && cache->n != NULL) { hts_cache_free_(cache->n); cache->n = NULL; } @@ -4218,7 +4365,7 @@ int _hts_lockdns(int i) { // si h_length==0 alors le nom n'existe pas dans le dns t_hostent* _hts_ghbn(t_dnscache* cache,char* iadr,t_hostent* retour) { // attendre que le cache dns soit prêt - while(_hts_lockdns(-1)); // attendre libération + //while(_hts_lockdns(-1)); // attendre libération _hts_lockdns(1); // locker while(1) { @@ -4273,7 +4420,7 @@ int hts_dnstest(char* _iadr) { #endif return 1; - while(_hts_lockdns(-1)); // attendre libération + // while(_hts_lockdns(-1)); // attendre libération _hts_lockdns(1); // locker while(1) { if (strcmp(cache->iadr,iadr)==0) { // ok trouvé @@ -4306,7 +4453,7 @@ HTSEXT_API t_hostent* vxgethostbyname(char* hostname, void* v_buffer) { The resolver doesn't seem to handle IP6 addresses in brackets */ if ((hostname[0] == '[') && (hostname[strlen(hostname)-1] == ']')) { - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0'; strncatbuff(tempo, hostname+1, strlen(hostname)-2); strcpybuff(hostname, tempo); @@ -4366,7 +4513,7 @@ HTSEXT_API t_hostent* vxgethostbyname(char* hostname, void* v_buffer) { // cache dns interne à HTS // ** FREE A FAIRE sur la chaine t_hostent* hts_gethostbyname(char* _iadr, void* v_buffer) { - char iadr[HTS_URLMAXSIZE*2]; + char BIGSTK iadr[HTS_URLMAXSIZE*2]; t_fullhostent* buffer = (t_fullhostent*) v_buffer; t_dnscache* cache=_hts_cache(); // adresse du cache t_hostent* hp; @@ -4499,6 +4646,13 @@ void* hts_calloc(size_t len,size_t len2) { memset(adr, 0, len * len2); return adr; } +void* hts_strdup(char* str) { + size_t size = str ? strlen(str) : 0; + char* adr = (char*) hts_malloc(size + 1); + fassert(adr != NULL); + strcpy(adr, str ? str : ""); + return adr; +} void* hts_xmalloc(size_t len,size_t len2) { mlink* lnk = (mlink*) calloc(1,sizeof(mlink)); fassert(lnk != NULL); @@ -4665,13 +4819,67 @@ int ftp_available(void) { #endif +int hts_dgb_init = 0; +FILE* hts_dgb_init_fp = NULL; +static void hts_dgb(char* msg); +HTSEXT_API void hts_debug(int level) { + hts_dgb_init = level; + if (hts_dgb_init > 0) { + hts_dgb("hts_debug() called"); + } +} +static void hts_dgb(char* msg) { + if (hts_dgb_init > 0) { + if (hts_dgb_init_fp == NULL) { +#ifdef _WIN32_WCE + hts_dgb_init_fp = fopen("\\Temp\\hts-debug.txt", "wb"); +#else + hts_dgb_init_fp = fopen("hts-debug.txt", "wb"); +#endif + if (hts_dgb_init_fp != NULL) { + fprintf(hts_dgb_init_fp, "* Creating file\r\n"); + } + } + if (hts_dgb_init_fp != NULL) { + fprintf(hts_dgb_init_fp, "%s\r\n", msg); + fflush(hts_dgb_init_fp); + } + } +} HTSEXT_API int hts_init(void) { static int hts_init_ok = 0; + hts_dgb("entering hts_init()"); /* debug */ + +#ifdef _WIN32_WCE +#ifndef HTS_CECOMPAT + xceinit(L""); +#endif +#endif + + /* Init threads */ + if (!hts_init_ok) { + htsthread_init(); + } + /* Ensure external modules are loaded */ + hts_dgb("calling htspe_init()"); /* debug */ htspe_init(); + /* MD5 Auto-test */ + { + char digest[32 + 2]; + unsigned char* atest = (unsigned char*)"MD5 Checksum Autotest"; + digest[0] = '\0'; + domd5mem(atest, strlen(atest), digest, 1); /* a42ec44369da07ace5ec1d660ba4a69a */ + if (strcmp(digest, "a42ec44369da07ace5ec1d660ba4a69a") != 0) { + int fatal_broken_md5 = 0; + assertf(fatal_broken_md5); + } + } + + hts_dgb("initializing default wrappers"); /* debug */ if (!hts_init_ok) { hts_init_ok = 1; // default wrappers @@ -4681,6 +4889,8 @@ HTSEXT_API int hts_init(void) { htswrap_add("start",htsdefault_start); htswrap_add("change-options",htsdefault_chopt); htswrap_add("end",htsdefault_end); + htswrap_add("preprocess-html",htsdefault_preprocesshtml); + htswrap_add("postprocess-html",htsdefault_postprocesshtml); htswrap_add("check-html",htsdefault_checkhtml); htswrap_add("loop",htsdefault_loop); htswrap_add("query",htsdefault_query); @@ -4690,10 +4900,14 @@ HTSEXT_API int hts_init(void) { htswrap_add("pause",htsdefault_pause); htswrap_add("save-file",htsdefault_filesave); htswrap_add("link-detected",htsdefault_linkdetected); + htswrap_add("link-detected2",htsdefault_linkdetected2); htswrap_add("transfer-status",htsdefault_xfrstatus); htswrap_add("save-name",htsdefault_savename); + htswrap_add("send-header",htsdefault_sendheader); + htswrap_add("receive-header",htsdefault_receiveheader); } + hts_dgb("initializing SSL"); /* debug */ #if HTS_USEOPENSSL /* Initialize the OpensSSL library @@ -4715,14 +4929,17 @@ HTSEXT_API int hts_init(void) { #endif /* Init vars and thread-specific values */ + hts_dgb("initializing variables"); /* debug */ hts_initvar(); /* initialiser structcheck */ // structcheck_init(1); + hts_dgb("ending hts_init()"); /* debug */ return 1; } HTSEXT_API int hts_uninit(void) { + //htsthread_uninit(); hts_cache_free(_hts_cache()); hts_freevar(); /* htswrap_free(); */ @@ -4744,6 +4961,12 @@ int __cdecl htsdefault_chopt(void* opt) { int __cdecl htsdefault_end(void) { return 1; } +int __cdecl htsdefault_preprocesshtml(char** html,int* len,char* url_adresse,char* url_fichier) { + return 1; +} +int __cdecl htsdefault_postprocesshtml(char** html,int* len,char* url_adresse,char* url_fichier) { + return 1; +} int __cdecl htsdefault_checkhtml(char* html,int len,char* url_adresse,char* url_fichier) { return 1; } @@ -4772,12 +4995,21 @@ void __cdecl htsdefault_filesave(char* file) { int __cdecl htsdefault_linkdetected(char* link) { return 1; } +int __cdecl htsdefault_linkdetected2(char* link, char* start_tag) { + return 1; +} int __cdecl htsdefault_xfrstatus(void* back) { return 1; } int __cdecl htsdefault_savename(char* adr_complete,char* fil_complete,char* referer_adr,char* referer_fil,char* save) { return 1; } +int __cdecl htsdefault_sendheader(char* buff, char* adr, char* fil, char* referer_adr, char* referer_fil, htsblk* outgoing) { + return 1; +} +int __cdecl htsdefault_receiveheader(char* buff, char* adr, char* fil, char* referer_adr, char* referer_fil, htsblk* incoming) { + return 1; +} // end defaut wrappers diff --git a/src/htslib.h b/src/htslib.h index d3881d3..23a8400 100644 --- a/src/htslib.h +++ b/src/htslib.h @@ -43,7 +43,9 @@ Please visit our Website: http://www.httrack.com #include "htsglobal.h" /* basic net definitions */ +#include "htsbase.h" #include "htsbasenet.h" +#include "htsnet.h" /* cookies et auth */ #include "htsbauth.h" @@ -52,12 +54,15 @@ Please visit our Website: http://www.httrack.com // (à modifier avec celle-ci) #define POSTTOK "?>post" -#include - #include "htsopt.h" +#define READ_ERROR (-1) +#define READ_EOF (-2) +#define READ_TIMEOUT (-3) +#define READ_INTERNAL_ERROR (-4) + // structure pour paramètres supplémentaires lors de la requête -typedef struct { +typedef struct htsrequest { short int user_agent_send; // user agent (ex: httrack/1.0 [sun]) short int http11; // l'en tête peut (doit) être signé HTTP/1.1 et non HTTP/1.0 short int nokeepalive; // pas de keep-alive @@ -65,13 +70,15 @@ typedef struct { short int nocompression; // Pas de compression short int flush_garbage; // recycled char user_agent[128]; + char referer[256]; + char from[256]; char lang_iso[64]; t_proxy proxy; // proxy } htsrequest; // structure pour retour d'une connexion/prise d'en tête -typedef struct { +typedef struct htsblk { int statuscode; // status-code, -1=erreur, 200=OK,201=..etc (cf RFC1945) short int notmodified; // page ou fichier NON modifié (transféré) short int is_write; // sortie sur disque (out) ou en mémoire (adr) @@ -83,6 +90,7 @@ typedef struct { int keep_alive_t; // KA timeout int keep_alive_max; // KA number of requests char* adr; // adresse du bloc de mémoire, NULL=vide + char* headers; // adresse des en têtes si présents FILE* out; // écriture directe sur disque (si is_write=1) LLint size; // taille fichier char msg[80]; // message éventuel si échec ("\0"=non précisé) @@ -93,6 +101,8 @@ typedef struct { LLint totalsize; // taille totale à télécharger (-1=inconnue) short int is_file; // ce n'est pas une socket mais un descripteur de fichier si 1 T_SOC soc; // ID socket + SOCaddr address; // IP address + int address_size; // IP address structure length FILE* fp; // fichier pour file:// #if HTS_USEOPENSSL short int ssl; // is this connection a SSL one? (https) @@ -105,8 +115,8 @@ typedef struct { LLint crange; // Content-Range int debugid; // debug connection /* */ - htsrequest req; // paramètres pour la requête - /*char digest[32+2]; // digest md5 généré par le moteur ("" si non généré)*/ + htsrequest req; // paramètres pour la requête + /*char digest[32+2]; // digest md5 généré par le moteur ("" si non généré)*/ } htsblk; @@ -144,12 +154,8 @@ typedef struct t_dnscache { - -/* -#ifdef __cplusplus -extern "C" { -#endif -*/ +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE // fonctions unix/winsock int hts_read(htsblk* r,char* buff,int size); @@ -215,6 +221,7 @@ void time_local_rfc822(char* s); struct tm* convert_time_rfc822(char* s); int set_filetime(char* file,struct tm* tm_time); int set_filetime_rfc822(char* file,char* date); +int get_filetime_rfc822(char* file,char* date); HTS_INLINE void time_rfc822(char* s,struct tm * A); HTS_INLINE void time_rfc822_local(char* s,struct tm * A); #ifndef HTTRACK_DEFLIB @@ -232,25 +239,23 @@ int linputsoc_t(T_SOC soc, char* s, int max, int timeout); int linput_trim(FILE* fp,char* s,int max); int linput_cpp(FILE* fp,char* s,int max); void rawlinput(FILE* fp,char* s,int max); -int strfield(const char* f,const char* s); -#define strfield2(f,s) ( (strlen(f)!=strlen(s)) ? 0 : (strfield(f,s)) ) char* strstrcase(char *s,char *o); int ident_url_absolute(char* url,char* adr,char* fil); void fil_simplifie(char* f); int is_unicode_utf8(unsigned char* buffer, unsigned int size); void map_characters(unsigned char* buffer, unsigned int size, unsigned int* map); -int ishtml(char* urlfil); -int ishtml_ext(char* a); +int ishtml(const char* urlfil); +int ishtml_ext(const char* a); int ishttperror(int err); -void guess_httptype(char *s,char *fil); -void get_httptype(char *s,char *fil,int flag); -int get_userhttptype(int setdefs,char *s,char *ext); +void guess_httptype(char *s,const char *fil); +void get_httptype(char *s,const char *fil,int flag); +int get_userhttptype(int setdefs,char *s,const char *ext); void give_mimext(char *s,char *st); -int is_knowntype(char *fil); -int is_userknowntype(char *fil); -int is_dyntype(char *fil); -char* get_ext(char *fil); -int may_unknown(char* st); +int is_knowntype(const char *fil); +int is_userknowntype(const char *fil); +int is_dyntype(const char *fil); +char* get_ext(const char *fil); +int may_unknown(const char* st); #ifndef HTTRACK_DEFLIB HTSEXT_API char* jump_identification(char*); HTSEXT_API char* jump_normalized(char*); @@ -259,6 +264,7 @@ HTSEXT_API char* fil_normalized(char* source, char* dest); HTSEXT_API char* adr_normalized(char* source, char* dest); #endif char* strrchr_limit(char* s, char c, char* limit); +char* strstr_limit(char* s, char* sub, char* limit); HTS_INLINE char* jump_protocol(char* source); void code64(unsigned char* a,int size_a,unsigned char* b,int crlf); #ifndef HTTRACK_DEFLIB @@ -270,15 +276,16 @@ HTSEXT_API void escape_uri_utf(char* s); HTSEXT_API void escape_check_url(char* s); HTSEXT_API char* escape_check_url_addr(char* s); HTSEXT_API void x_escape_http(char* s,int mode); +HTSEXT_API void x_escape_html(char* s); HTSEXT_API void escape_remove_control(char* s); +HTSEXT_API void escape_for_html_print(char* s, char* d); +HTSEXT_API void escape_for_html_print_full(char* s, char* d); #endif -int ehexh(char c); #ifndef HTTRACK_DEFLIB HTSEXT_API char* unescape_http(char* s); HTSEXT_API char* unescape_http_unharm(char* s, int no_high); HTSEXT_API char* antislash_unescaped(char* s); #endif -int ehex(char* s); char* concat(const char* a,const char* b); #define copychar(a) concat((a),NULL) #if HTS_DOSNAME @@ -296,14 +303,6 @@ char* concat(const char* a,const char* b); void hts_lowcase(char* s); void hts_replace(char *s,char from,char to); -/* Spaces: CR,LF,TAB,FF */ -#define is_space(c) ( ((c)==' ') || ((c)=='\"') || ((c)==10) || ((c)==13) || ((c)==9) || ((c)==12) || ((c)==11) || ((c)=='\'') ) -#define is_realspace(c) ( ((c)==' ') || ((c)==10) || ((c)==13) || ((c)==9) || ((c)==12) || ((c)==11) ) -#define is_taborspace(c) ( ((c)==' ') || ((c)==9) ) -#define is_quote(c) ( ((c)=='\"') || ((c)=='\'') ) -#define is_retorsep(c) ( ((c)==10) || ((c)==13) || ((c)==9) ) -//HTS_INLINE int is_space(char); -//HTS_INLINE int is_realspace(char); void fprintfio(FILE* fp,char* buff,char* prefix); @@ -328,11 +327,6 @@ typedef void* ( *beginthread_type )( void * ); unsigned long _beginthread( beginthread_type start_address, unsigned stack_size, void *arglist ); #endif -/* -#ifdef __cplusplus -} -#endif -*/ @@ -360,6 +354,8 @@ void __cdecl htsdefault_uninit(void); int __cdecl htsdefault_start(void* opt); int __cdecl htsdefault_chopt(void* opt); int __cdecl htsdefault_end(void); +int __cdecl htsdefault_preprocesshtml(char** html,int* len,char* url_adresse,char* url_fichier); +int __cdecl htsdefault_postprocesshtml(char** html,int* len,char* url_adresse,char* url_fichier); int __cdecl htsdefault_checkhtml(char* html,int len,char* url_adresse,char* url_fichier); int __cdecl htsdefault_loop(void* back,int back_max,int back_index,int lien_n,int lien_tot,int stat_time,hts_stat_struct* stats); char* __cdecl htsdefault_query(char* question); @@ -369,10 +365,150 @@ int __cdecl htsdefault_check(char* adr,char* fil,int status); void __cdecl htsdefault_pause(char* lockfile); void __cdecl htsdefault_filesave(char*); int __cdecl htsdefault_linkdetected(char* link); +int __cdecl htsdefault_linkdetected2(char* link, char* tag_start); int __cdecl htsdefault_xfrstatus(void* back); int __cdecl htsdefault_savename(char* adr_complete,char* fil_complete,char* referer_adr,char* referer_fil,char* save); +int __cdecl htsdefault_sendheader(char* buff, char* adr, char* fil, char* referer_adr, char* referer_fil, htsblk* outgoing); +int __cdecl htsdefault_receiveheader(char* buff, char* adr, char* fil, char* referer_adr, char* referer_fil, htsblk* incoming); + // end defaut wrappers + +// htsmodule.c definitions +extern void* getFunctionPtr(httrackp* opt, char* file, char* fncname); +extern void clearCallbacks(htscallbacks* chain); + + + +#endif // internals + + +/* Spaces: CR,LF,TAB,FF */ +#define is_space(c) ( ((c)==' ') || ((c)=='\"') || ((c)==10) || ((c)==13) || ((c)==9) || ((c)==12) || ((c)==11) || ((c)=='\'') ) +#define is_realspace(c) ( ((c)==' ') || ((c)==10) || ((c)==13) || ((c)==9) || ((c)==12) || ((c)==11) ) +#define is_taborspace(c) ( ((c)==' ') || ((c)==9) ) +#define is_quote(c) ( ((c)=='\"') || ((c)=='\'') ) +#define is_retorsep(c) ( ((c)==10) || ((c)==13) || ((c)==9) ) +//HTS_INLINE int is_space(char); +//HTS_INLINE int is_realspace(char); + +// compare le début de f avec s et retourne la position de la fin +// 'A=a' (case insensitive) +static int strfield(const char* f,const char* s) { + int r=0; + while (streql(*f,*s) && ((*f)!=0) && ((*s)!=0)) { f++; s++; r++; } + if (*s==0) + return r; + else + return 0; +} +static int strcmpnocase(char* a,char* b) { + while(*a) { + int cmp = hichar(*a) - hichar(*b); + if (cmp != 0) + return cmp; + a++; + b++; + } + return 0; +} + +#ifdef _WIN32 +#define strcasecmp(a,b) stricmp(a,b) +#define strncasecmp(a,b,n) strnicmp(a,b,n) +#endif + +#define strfield2(f,s) ( (strlen(f)!=strlen(s)) ? 0 : (strfield(f,s)) ) + +// is this MIME an hypertext MIME (text/html), html/js-style or other script/text type? +#define HTS_HYPERTEXT_DEFAULT_MIME "text/html" +#define is_hypertext_mime__(a) \ + ( (strfield2((a),"text/html")!=0)\ + || (strfield2((a),"application/x-javascript")!=0) \ + || (strfield2((a),"text/css")!=0) \ + /*|| (strfield2((a),"text/vnd.wap.wml")!=0)*/ \ + || (strfield2((a),"image/svg+xml")!=0) \ + || (strfield2((a),"image/svg-xml")!=0) \ + /*|| (strfield2((a),"audio/x-pn-realaudio")!=0) */\ + || (strfield2((a),"application/x-authorware-map")!=0) \ + ) +#define may_be_hypertext_mime__(a) \ + (\ + (strfield2((a),"audio/x-pn-realaudio")!=0) \ + || (strfield2((a),"audio/x-mpegurl")!=0) \ + ) + + +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE + +// check if (mime, file) is hypertext +static int is_hypertext_mime(const char* mime, const char* file) { + if (is_hypertext_mime__(mime)) + return 1; + if (may_unknown(mime)) { + char guessed[256]; + guessed[0] = '\0'; + guess_httptype(guessed, file); + return is_hypertext_mime__(guessed); + } + return 0; +} + +// check if (mime, file) might be "false" hypertext +static int may_be_hypertext_mime(const char* mime, const char* file) { + if (may_be_hypertext_mime__(mime)) + return 1; + if (file != NULL && file[0] != '\0' && may_unknown(mime)) { + char guessed[256]; + guessed[0] = '\0'; + guess_httptype(guessed, file); + return may_be_hypertext_mime__(guessed); + } + return 0; +} + +// compare (mime, file) with reference +static int compare_mime(const char* mime, const char* file, const char* reference) { + if (is_hypertext_mime__(mime) || may_be_hypertext_mime__(mime)) + return strfield2(mime, reference); + if (file != NULL && file[0] != '\0' && may_unknown(mime)) { + char guessed[256]; + guessed[0] = '\0'; + guess_httptype(guessed, file); + return strfield2(guessed, reference); + } + return 0; +} + +#endif + +#ifdef _WIN32_WCE_XXC +extern char cwd[MAX_PATH+1]; +static char *getcwd_ce(char *buffer, int maxlen) +{ + TCHAR fileUnc[MAX_PATH+1]; + char* plast; + + if(cwd[0] == 0) + { + GetModuleFileName(NULL, fileUnc, MAX_PATH); + WideCharToMultiByte(CP_ACP, 0, fileUnc, -1, cwd, MAX_PATH, NULL, NULL); + plast = strrchr(cwd, '\\'); + if(plast) + *plast = 0; + /* Special trick to keep start menu clean... */ + if(_stricmp(cwd, "\\windows\\start menu") == 0) + strcpy(cwd, "\\Apps"); + } + if(buffer) + strncpy(buffer, cwd, maxlen); + return cwd; +} +#undef getcwd +#define getcwd getcwd_ce +#endif + #endif diff --git a/src/htsmd5.c b/src/htsmd5.c index 47242d8..92aec5e 100644 --- a/src/htsmd5.c +++ b/src/htsmd5.c @@ -39,6 +39,9 @@ Please visit our Website: http://www.httrack.com /* Modified 2000 by Xavier Roche for domd5mem */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + #include "htsmd5.h" #include "md5.h" #include @@ -48,12 +51,25 @@ int domd5mem(unsigned char * buf, int len, unsigned char * digest, int asAscii) { int endian = 1; unsigned char bindigest[16]; +#if 1 +//#ifndef _WIN32_WCE MD5_CTX ctx; MD5Init(&ctx, * ( (char*) &endian)); MD5Update(&ctx, buf, len); MD5Final(bindigest, &ctx); - +#else + /* Broken md5.. temporary hack */ + int i; + memset(bindigest, 0, 16); + if (len > 0) { + for(i = 0 ; i < len + 16 ; i++) { + bindigest[i % 16] ^= ( buf[i % len] + i + len ); + bindigest[(i - 1) % 16] ^= bindigest[ ( i + buf[i % len]*buf[(i-1) % len] ) % 16]; + } + } +#endif + if (!asAscii) { memcpy(digest, bindigest, 16); } else { @@ -70,7 +86,8 @@ int domd5mem(unsigned char * buf, int len, } unsigned long int md5sum32(char* buff) { - char digest[16]; - domd5mem(buff,strlen(buff),digest,0); - return *( (long int*)(char*)digest ); + unsigned char md5digest[16]; + unsigned char* md5digest_ = md5digest; + domd5mem(buff,strlen(buff),md5digest,0); + return *( (long int*)(char*)md5digest ); } diff --git a/src/htsmd5.h b/src/htsmd5.h index 84148bd..3e3b00c 100644 --- a/src/htsmd5.h +++ b/src/htsmd5.h @@ -42,9 +42,12 @@ Please visit our Website: http://www.httrack.com #ifndef HTSMD5_DEFH #define HTSMD5_DEFH +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE int domd5mem(unsigned char * buf, int len, unsigned char * digest, int asAscii); unsigned long int md5sum32(char* buff); +#endif #endif diff --git a/src/htsmodules.c b/src/htsmodules.c index 27ab855..3299c41 100644 --- a/src/htsmodules.c +++ b/src/htsmodules.c @@ -35,21 +35,20 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ -#ifndef _WIN32 -#if HTS_DLOPEN -#include -#endif -#endif - -#include -#include -#include +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE #include "htsglobal.h" #include "htsmodules.h" #include "htsopt.h" extern int fspc(FILE* fp,char* type); +#ifndef _WIN32 +#if HTS_DLOPEN +#include +#endif +#endif + /* >>> Put all modules definitions here */ #include "htszlib.h" #include "htsbase.h" @@ -71,9 +70,11 @@ t_hts_detect_swf hts_detect_swf = NULL; t_hts_parse_swf hts_parse_swf = NULL; int gz_is_available = 0; +#if 0 t_gzopen gzopen = NULL; t_gzread gzread = NULL; t_gzclose gzclose = NULL; +#endif int SSL_is_available = 0; t_SSL_shutdown SSL_shutdown = NULL; @@ -108,6 +109,7 @@ void abortLog__fnc(char* msg, char* file, int line) { FILE* fp = fopen("CRASH.TXT", "wb"); if (!fp) fp = fopen("/tmp/CRASH.TXT", "wb"); if (!fp) fp = fopen("C:\\CRASH.TXT", "wb"); + if (!fp) fp = fopen("CRASH.TXT", "wb"); if (fp) { fprintf(fp, "HTTrack " HTTRACK_VERSIONID " closed at '%s', line %d\r\n", file, line); fprintf(fp, "Reason:\r\n%s\r\n", msg); @@ -144,17 +146,60 @@ int hts_parse_externals(htsmoduleStruct* str) { return -1; } -/* NOTE: handled NOT closed */ -void* getFunctionPtr(char* file_, char* fncname) { - char file[1024]; +static void addCallback(htscallbacks* chain, void* moduleHandle, htscallbacksfncptr exitFnc) { + while(chain->next != NULL) { + chain = chain->next; + } + chain->next = calloct(1, sizeof(htscallbacks)); + assertf(chain->next != NULL); + chain = chain->next; + memset(chain, 0, sizeof(*chain)); + chain->exitFnc = exitFnc; + chain->moduleHandle = moduleHandle; +} + +void clearCallbacks(htscallbacks* chain_); +void clearCallbacks(htscallbacks* chain_) { + htscallbacks* chain; + chain = chain_; + while(chain != NULL) { + if (chain->exitFnc != NULL) { + (void) chain->exitFnc(); /* result ignored */ + chain->exitFnc = NULL; + } + chain = chain->next; + } + chain = chain_; + while(chain != NULL) { + if (chain->moduleHandle != NULL) { +#ifdef _WIN32 + FreeLibrary(chain->moduleHandle); +#else + dlclose(chain->moduleHandle); +#endif + } + chain = chain->next; + } + chain = chain_->next; // Don't free the block #0 + while(chain != NULL) { + htscallbacks* nextchain = chain->next; + freet(chain); + chain = nextchain; + } + chain_->next = NULL; // Empty +} + +void* getFunctionPtr(httrackp* opt, char* file_, char* fncname); +void* getFunctionPtr(httrackp* opt, char* file_, char* fncname) { + char BIGSTK file[1024]; void* handle; void* userfunction = NULL; strcpybuff(file, file_); #ifdef _WIN32 - handle = LoadLibrary(file); + handle = LoadLibraryA((char*)file); if (handle == NULL) { strcatbuff(file, ".dll"); - handle = LoadLibrary(file); + handle = LoadLibraryA((char*)file); } #else handle = dlopen(file, RTLD_LAZY); @@ -164,13 +209,61 @@ void* getFunctionPtr(char* file_, char* fncname) { } #endif if (handle) { - userfunction = (void*) DynamicGet(handle, fncname); + /* Thanks to Lars Clausen for the "wrapper-init" patch */ + /* If given arguments, call "_init" */ + char BIGSTK tmpName[1024]; + char *comma; + if ((comma = strchr(fncname, ',')) != NULL) { /* empty arg */ + *comma++ = '\0'; + } + + /* speficic plug init */ + { + t_htsWrapperPlugInit initfunction; + sprintf(tmpName, "%s_init", fncname); + initfunction = (t_htsWrapperPlugInit)DynamicGet(handle, (char*)tmpName); + if (initfunction != NULL) { + int result = (int) initfunction(comma); + if (!result) { + if (userfunction == NULL) { +#ifdef _WIN32 + FreeLibrary(handle); +#else + dlclose(handle); +#endif + } + return NULL; + } + } + } + /* wrapper_init() */ + { + t_htsWrapperInit initfunction = (t_htsWrapperInit)DynamicGet(handle, (char*)"wrapper_init"); + if (initfunction != NULL) { + if (! initfunction(fncname, comma)) { + if (userfunction == NULL) { +#ifdef _WIN32 + FreeLibrary(handle); +#else + dlclose(handle); +#endif + } + return NULL; + } + } + } + /* the function itself */ + userfunction = (void*) DynamicGet(handle, (char*)fncname); if (userfunction == NULL) { #ifdef _WIN32 FreeLibrary(handle); #else dlclose(handle); #endif + } else { + /* optional exit wrapper */ + t_htsWrapperExit exitFnc = (t_htsWrapperExit) DynamicGet(handle, (char*)"wrapper_exit"); + addCallback(&opt->state.callbacks, handle, exitFnc); // exitFnc can be null } } return userfunction; @@ -183,7 +276,10 @@ void htspe_init() { /* >>> Put all module initializations here */ + /* Zlib */ + gz_is_available = 1; + /* #if HTS_DLOPEN { void* handle; @@ -202,13 +298,14 @@ void htspe_init() { } } #endif + */ /* OpenSSL */ #if HTS_DLOPEN { void* handle; #ifdef _WIN32 - handle = LoadLibrary("ssleay32"); + handle = LoadLibraryA((char*)"ssleay32"); #else /* We are compatible with 0.9.6/7 and potentially above */ handle = dlopen("libssl.so.0.9.7", RTLD_LAZY); @@ -221,27 +318,27 @@ void htspe_init() { } #endif if (handle) { - SSL_shutdown = (t_SSL_shutdown) DynamicGet(handle, "SSL_shutdown"); - SSL_free = (t_SSL_free) DynamicGet(handle, "SSL_free"); - SSL_new = (t_SSL_new) DynamicGet(handle, "SSL_new"); - SSL_clear = (t_SSL_clear) DynamicGet(handle, "SSL_clear"); - SSL_set_fd = (t_SSL_set_fd) DynamicGet(handle, "SSL_set_fd"); - SSL_set_connect_state = (t_SSL_set_connect_state) DynamicGet(handle, "SSL_set_connect_state"); - SSL_connect = (t_SSL_connect) DynamicGet(handle, "SSL_connect"); - SSL_get_error = (t_SSL_get_error) DynamicGet(handle, "SSL_get_error"); - SSL_write = (t_SSL_write) DynamicGet(handle, "SSL_write"); - SSL_read = (t_SSL_read) DynamicGet(handle, "SSL_read"); - SSL_library_init = (t_SSL_library_init) DynamicGet(handle, "SSL_library_init"); - ERR_load_SSL_strings = (t_ERR_load_SSL_strings) DynamicGet(handle, "ERR_load_SSL_strings"); - SSLv23_client_method = (t_SSLv23_client_method) DynamicGet(handle, "SSLv23_client_method"); - SSL_CTX_new = (t_SSL_CTX_new) DynamicGet(handle, "SSL_CTX_new"); - SSL_load_error_strings = (t_SSL_load_error_strings) DynamicGet(handle, "SSL_load_error_strings"); - SSL_CTX_ctrl = (t_SSL_CTX_ctrl) DynamicGet(handle, "SSL_CTX_ctrl"); + SSL_shutdown = (t_SSL_shutdown) DynamicGet(handle, (char*)"SSL_shutdown"); + SSL_free = (t_SSL_free) DynamicGet(handle, (char*)"SSL_free"); + SSL_new = (t_SSL_new) DynamicGet(handle, (char*)"SSL_new"); + SSL_clear = (t_SSL_clear) DynamicGet(handle, (char*)"SSL_clear"); + SSL_set_fd = (t_SSL_set_fd) DynamicGet(handle, (char*)"SSL_set_fd"); + SSL_set_connect_state = (t_SSL_set_connect_state) DynamicGet(handle, (char*)"SSL_set_connect_state"); + SSL_connect = (t_SSL_connect) DynamicGet(handle, (char*)"SSL_connect"); + SSL_get_error = (t_SSL_get_error) DynamicGet(handle, (char*)"SSL_get_error"); + SSL_write = (t_SSL_write) DynamicGet(handle, (char*)"SSL_write"); + SSL_read = (t_SSL_read) DynamicGet(handle, (char*)"SSL_read"); + SSL_library_init = (t_SSL_library_init) DynamicGet(handle, (char*)"SSL_library_init"); + ERR_load_SSL_strings = (t_ERR_load_SSL_strings) DynamicGet(handle, (char*)"ERR_load_SSL_strings"); + SSLv23_client_method = (t_SSLv23_client_method) DynamicGet(handle, (char*)"SSLv23_client_method"); + SSL_CTX_new = (t_SSL_CTX_new) DynamicGet(handle, (char*)"SSL_CTX_new"); + SSL_load_error_strings = (t_SSL_load_error_strings) DynamicGet(handle, (char*)"SSL_load_error_strings"); + SSL_CTX_ctrl = (t_SSL_CTX_ctrl) DynamicGet(handle, (char*)"SSL_CTX_ctrl"); #ifdef _WIN32 - handle = LoadLibrary("libeay32"); + handle = LoadLibraryA((char*)"libeay32"); #endif - ERR_load_crypto_strings = (t_ERR_load_crypto_strings) DynamicGet(handle, "ERR_load_crypto_strings"); - ERR_error_string = (t_ERR_error_string) DynamicGet(handle, "ERR_error_string"); + ERR_load_crypto_strings = (t_ERR_load_crypto_strings) DynamicGet(handle, (char*)"ERR_load_crypto_strings"); + ERR_error_string = (t_ERR_error_string) DynamicGet(handle, (char*)"ERR_error_string"); if (SSL_shutdown && SSL_free && SSL_CTX_ctrl && SSL_new && SSL_clear && SSL_set_fd && SSL_set_connect_state && SSL_connect && SSL_get_error && SSL_write @@ -262,7 +359,7 @@ void htspe_init() { #if HTS_DLOPEN { #ifdef _WIN32 - void* handle = LoadLibrary("htsswf"); + void* handle = LoadLibraryA((char*)"htsswf"); #else void* handle = dlopen("libhtsswf.so.1", RTLD_LAZY); #endif @@ -300,6 +397,7 @@ static void htspe_log(htsmoduleStruct* str, char* msg) { } } +HTSEXT_API const char* hts_is_available(void); HTSEXT_API const char* hts_is_available(void) { return WHAT_is_available; } diff --git a/src/htsmodules.h b/src/htsmodules.h index 7d1154b..5d2b989 100644 --- a/src/htsmodules.h +++ b/src/htsmodules.h @@ -98,14 +98,22 @@ struct htsmoduleStruct { }; +/* Used to wrap module initialization */ +/* return 1 if init was ok */ +typedef int (*t_htsWrapperInit)(char *fn, char *args); +typedef int (*t_htsWrapperExit)(void); +typedef int (*t_htsWrapperPlugInit)(char *args); + +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE extern void htspe_init(void); extern int hts_parse_externals(htsmoduleStruct* str); -extern void* getFunctionPtr(char* file, char* fncname); extern int gz_is_available; extern int swf_is_available; extern int SSL_is_available; extern int V6_is_available; extern char WHAT_is_available[64]; +#endif #endif diff --git a/src/htsname.c b/src/htsname.c index 56fa6a6..8af2062 100644 --- a/src/htsname.c +++ b/src/htsname.c @@ -35,14 +35,15 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + #include "htsname.h" /* specific definitions */ #include "htsbase.h" #include "htstools.h" #include "htsmd5.h" -#include -#include #include /* END specific definitions */ @@ -51,7 +52,7 @@ Please visit our Website: http://www.httrack.com #define ADD_STANDARD_PATH \ { /* ajout nom */\ - char buff[HTS_URLMAXSIZE*2];\ + char BIGSTK buff[HTS_URLMAXSIZE*2];\ buff[0]='\0';\ strncatbuff(buff,start_pos,(int) (nom_pos - start_pos));\ url_savename_addstr(save,buff);\ @@ -59,7 +60,7 @@ Please visit our Website: http://www.httrack.com #define ADD_STANDARD_NAME(shortname) \ { /* ajout nom */\ - char buff[HTS_URLMAXSIZE*2];\ + char BIGSTK buff[HTS_URLMAXSIZE*2];\ standard_name(buff,dot_pos,nom_pos,fil_complete,(shortname));\ url_savename_addstr(save,buff);\ } @@ -78,13 +79,38 @@ static const char *hts_tbdev[] = }; +#define URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET() do { \ + int prev = _hts_in_html_parsing; \ + while(back_pluggable_sockets_strict(back, back_max, opt) <= 0) { \ + _hts_in_html_parsing = 6; \ + /* Wait .. */ \ + back_wait(back,back_max,opt,cache,0); \ + /* Transfer rate */ \ + engine_stats(); \ + /* Refresh various stats */ \ + HTS_STAT.stat_nsocket=back_nsoc(back,back_max); \ + HTS_STAT.stat_errors=fspc(NULL,"error"); \ + HTS_STAT.stat_warnings=fspc(NULL,"warning"); \ + HTS_STAT.stat_infos=fspc(NULL,"info"); \ + HTS_STAT.nbk=backlinks_done(liens,lien_tot,ptr); \ + HTS_STAT.nb=back_transfered(HTS_STAT.stat_bytes,back,back_max); \ + /* Check */ \ + if (!hts_htmlcheck_loop(back,back_max,-1,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { \ + return -1; \ + } \ + } \ + _hts_in_html_parsing = prev; \ +} while(0) + // forme le nom du fichier à sauver (save) à partir de fil et adr // système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html) int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_adr,char* former_fil,char* referer_adr,char* referer_fil,httrackp* opt,lien_url** liens,int lien_tot,lien_back* back,int back_max,cache_back* cache,hash_struct* hash,int ptr,int numero_passe) { - char newfil[HTS_URLMAXSIZE*2]; /* ="" */ - /*char normadr_[HTS_URLMAXSIZE*2];*/ - char normfil_[HTS_URLMAXSIZE*2]; + char BIGSTK newfil[HTS_URLMAXSIZE*2]; /* ="" */ + /*char BIGSTK normadr_[HTS_URLMAXSIZE*2];*/ + char BIGSTK normadr_[HTS_URLMAXSIZE*2], normfil_[HTS_URLMAXSIZE*2]; + int protocol = 0; + static const char* protocol_str[] = {"http", "https", "ftp", "file", "unknown"}; char* normadr; char* normfil; char* fil; @@ -100,11 +126,11 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a /* 8-3 ? */ switch(opt->savename_83) { - case 1: + case 1: // 8-3 max_char=8; break; - case 2: - max_char=30; + case 2: // Level 2 File names may be up to 31 characters. + max_char=31; break; default: max_char=8; @@ -130,13 +156,33 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a // www-42.foo.com -> foo.com // foo.com/bar//foobar -> foo.com/bar/foobar if (opt->urlhack) { - // copy of adr (withiotu protocol), used for lookups (see urlhack) - normadr=jump_normalized(adr); + // copy of adr (without protocol), used for lookups (see urlhack) + normadr=adr_normalized(adr, normadr_); normfil=fil_normalized(fil,normfil_); + } else { + if (link_has_authority(adr_complete)) { // https or other protocols : in "http/" subfolder + char* pos = strchr(adr_complete, ':'); + if (pos != NULL) { + normadr_[0] = '\0'; + strncatbuff(normadr_, adr_complete, (int)(pos - adr_complete)); + strcatbuff(normadr_, "://"); + strcatbuff(normadr_, normadr); + normadr=normadr_; + } + } } // à afficher sans ftp:// print_adr=jump_protocol(adr); + if (strfield(adr_complete, "https:")) { + protocol = 1; + } else if (strfield(adr_complete, "ftp:")) { + protocol = 2; + } else if (strfield(adr_complete, "file:")) { + protocol = 3; + } else { + protocol = 0; + } // court-circuit pour lien primaire if (strnotempty(adr)==0) { @@ -199,7 +245,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a // chercher sans / ou avec / dans former { - char fil_complete_patche[HTS_URLMAXSIZE*2]; + char BIGSTK fil_complete_patche[HTS_URLMAXSIZE*2]; strcpybuff(fil_complete_patche,normfil); // Version avec ou sans / if (fil_complete_patche[strlen(fil_complete_patche)-1]=='/') @@ -254,30 +300,13 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a } // décoder % strcpybuff(fil,unescape_http(fil)); - /* - { - char tempo[HTS_URLMAXSIZE*2]; - int i,j=0; - for (i=0;i<(int) strlen(fil);i++) { - if (fil[i]=='%') { - i++; - tempo[j++]=(char) ehex(fil+i); - i++; // sauter 2 caractères finalement - } else - tempo[j++]=fil[i]; - } - tempo[j++]='\0'; - strcpybuff(fil,tempo); - } - */ - - + /* replace shtml to html.. */ switch (ishtml(fil)) { /* .html,.shtml,.. */ case 1: if ( - (strcmp(get_ext(fil),"html") != 0) - && (strcmp(get_ext(fil),"htm") != 0) + (strfield2(get_ext(fil),"html") == 0) + && (strfield2(get_ext(fil),"htm") == 0) ) { strcpybuff(ext,"html"); ext_chg=1; @@ -286,7 +315,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a case 0: if (!strnotempty(ext)) { if (is_userknowntype(get_ext(fil))) { // mime known by user - char mime[1024]; + char BIGSTK mime[1024]; mime[0]=ext[0]='\0'; get_userhttptype(0,mime,get_ext(fil)); if (strnotempty(mime)) { @@ -330,13 +359,35 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a } } // + } else if (is_userknowntype(fil)) { /* PATCH BY BRIAN SCHRÖDER. + Lookup mimetype not only by extension, + but also by filename */ + /* Note: "foo.cgi => text/html" means that foo.cgi shall have the text/html MIME file type, + that is, ".html" */ + char BIGSTK mime[1024]; + mime[0]=ext[0]='\0'; + get_userhttptype(0, mime, fil); + if (strnotempty(mime)) { + give_mimext(ext, mime); + if (strnotempty(ext)) { + ext_chg=1; + } + } } else { // test imposible dans le cache, faire une requête // #if HTS_ANALYSTE int hihp=_hts_in_html_parsing; #endif int has_been_moved=0; - char curr_adr[HTS_URLMAXSIZE*2],curr_fil[HTS_URLMAXSIZE*2]; + char BIGSTK curr_adr[HTS_URLMAXSIZE*2],curr_fil[HTS_URLMAXSIZE*2]; + + /* Ensure we don't use too many sockets by using a "testing" one + If we have only 1 simultaneous connection authorized, wait for pending download + Wait for an available slot + */ + URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET(); + + /* Rock'in */ curr_adr[0]=curr_fil[0]='\0'; #if HTS_ANALYSTE _hts_in_html_parsing=2; // test @@ -383,7 +434,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a if (!hts_htmlcheck_loop(back,back_max,b,ptr,lien_tot,(int) (time_local()-HTS_STAT.stat_timestart),&HTS_STAT)) { return -1; } else if (_hts_cancel || !back_checkmirror(opt)) { // cancel 2 ou 1 (cancel parsing) - back_delete(opt,back,b); // cancel test + back_delete(opt,cache,back,b); // cancel test stop_looping = 1; } } @@ -399,7 +450,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a ) { // agh moved.. un tit tour de plus if ((petits_tours<5) && (former_adr) && (former_fil)) { // on va pas tourner en rond non plus! if ((int) strnotempty(back[b].r.location)) { // location existe! - char mov_url[HTS_URLMAXSIZE*2],mov_adr[HTS_URLMAXSIZE*2],mov_fil[HTS_URLMAXSIZE*2]; + char BIGSTK mov_url[HTS_URLMAXSIZE*2],mov_adr[HTS_URLMAXSIZE*2],mov_fil[HTS_URLMAXSIZE*2]; mov_url[0]=mov_adr[0]=mov_fil[0]='\0'; // strcpybuff(mov_url,back[b].r.location); // copier URL @@ -424,11 +475,12 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a robots_wizard* robots = (robots_wizard*) opt->robotsptr; if (hts_acceptlink(opt,ptr,lien_tot,liens, mov_adr,mov_fil, + NULL, NULL, &set_prio_to, NULL) == 1) { /* forbidden */ has_been_moved = 1; - back_maydelete(opt,back,b); // ok + back_maydelete(opt,cache,back,b); // ok strcpybuff(curr_adr,mov_adr); strcpybuff(curr_fil,mov_fil); mov_url[0]='\0'; @@ -439,7 +491,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a // ftp: stop! if (strfield(mov_url,"ftp://")) { // ftp, ok on arrête has_been_moved = 1; - back_maydelete(opt,back,b); // ok + back_maydelete(opt,cache,back,b); // ok strcpybuff(curr_adr,mov_adr); strcpybuff(curr_fil,mov_fil); stop_looping = 1; @@ -455,6 +507,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a } } // Ajouter + URLSAVENAME_WAIT_FOR_AVAILABLE_SOCKET(); if (back_add(back,back_max,opt,cache,mov_adr,mov_fil,methode,referer_adr,referer_fil,1,NULL)!=-1) { // OK if ( (opt->debug>1) && (opt->errlog!=NULL) ) { fspc(opt->errlog,"warning"); fprintf(opt->errlog,"(during prefetch) %s (%d) to link %s at %s%s"LF,back[b].r.msg,back[b].r.statuscode,back[b].r.location,curr_adr,curr_fil); @@ -462,7 +515,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a } // libérer emplacement backing actuel et attendre le prochain - back_maydelete(opt,back,b); + back_maydelete(opt,cache,back,b); strcpybuff(curr_adr,mov_adr); strcpybuff(curr_fil,mov_fil); b=back_index(back,back_max,curr_adr,curr_fil,methode); @@ -507,7 +560,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a fspc(opt->errlog,0); fprintf(opt->errlog,"Error: (during prefetch) %s (%d) to link %s at %s%s"LF,back[b].r.msg,back[b].r.statuscode,back[b].r.location,curr_adr,curr_fil); test_flush; } - back_delete(opt,back,b); + back_delete(opt,cache,back,b); return -1; // ERREUR (404 par exemple) */ } @@ -531,7 +584,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a // FIN Si non déplacé, forcer type? // libérer emplacement backing - back_maydelete(opt,back,b); + back_maydelete(opt,cache,back,b); // --- --- --- // oops, a été déplacé.. on recalcule en récursif (osons!) @@ -787,7 +840,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a if (!short_ver) { // Noms longs strncatbuff(b,fil,(int) (nom_pos - fil) - 1); } else { - char pth[HTS_URLMAXSIZE*2],n83[HTS_URLMAXSIZE*2]; + char BIGSTK pth[HTS_URLMAXSIZE*2],n83[HTS_URLMAXSIZE*2]; pth[0]=n83[0]='\0'; // strncatbuff(pth,fil,(int) (nom_pos - fil) - 1); @@ -816,7 +869,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a *b='\0'; { char digest[32+2]; - char buff[HTS_URLMAXSIZE*2]; + char BIGSTK buff[HTS_URLMAXSIZE*2]; digest[0]=buff[0]='\0'; strcpybuff(buff,adr); strcatbuff(buff,fil_complete); @@ -831,6 +884,11 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a strncatbuff(b,url_md5(fil_complete),(tok == 'Q')?32:4); b+=strlen(b); // pointer à la fin break; + case 'r': case 'R': // protocol + *b='\0'; + strcatbuff(b, protocol_str[protocol]); + b+=strlen(b); // pointer à la fin + break; } } else *b++=*a++; @@ -1044,7 +1102,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a { char* a=jump_identification(save); if (a!=save) { - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; char *b; tempo[0]='\0'; strcpybuff(tempo,"["); @@ -1061,7 +1119,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a // éviter les / au début (cause: N100) if (save[0]=='/') { - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; strcpybuff(tempo,save+1); strcpybuff(save,tempo); } @@ -1110,7 +1168,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a case '/': case '.': { - char tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0'; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0'; strncatbuff(tempo,save,(int) (a - save) + strlen(hts_tbdev[i])); strcatbuff(tempo,"_"); strcatbuff(tempo,a+strlen(hts_tbdev[i])); @@ -1123,15 +1181,57 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a i++; } } + /* Strip ending . or ' ' forbidden on windoz */ + { + int len; + char* a=save; + while((a=strstr(a,"./"))) { + *a = '_'; + } + a=save; + while((a=strstr(a," /"))) { + *a = '_'; + } + len = (int) strlen(save); + if (len > 0 && ( save[len - 1] == '.' || save[len - 1] == ' ') ) { + save[len - 1] = '_'; + } + } #endif // conversion 8-3 .. y compris pour les répertoires if (opt->savename_83) { - char n83[HTS_URLMAXSIZE*2]; + char BIGSTK n83[HTS_URLMAXSIZE*2]; long_to_83(opt->savename_83,n83,save); strcpybuff(save,n83); } + // enforce stricter ISO9660 compliance (bug reported by Steffo Carlsson) + // Level 1 File names are restricted to 8 characters with a 3 character extension, + // upper case letters, numbers and underscore; maximum depth of directories is 8. + // This will be our "DOS mode" + // L2: 31 characters + // A-Z,0-9,_ + if (opt->savename_83 > 0) { + char *a, *last; + for(last = save + strlen(save) - 1 ; last != save && *last != '/' && *last != '\\' && *last != '.' ; last--); + if (*last != '.') { + last = NULL; + } + for(a = save ; *a != '\0' ; a++) { + if (*a >= 'a' && *a <= 'z') { + *a -= 'a' - 'A'; + } + else if (*a == '.') { + if (a != last) { + *a = '_'; + } + } + else if ( ! ( (*a >= 'A' && *a <= 'Z') || (*a >= '0' && *a <= '9') || *a == '_' || *a == '/' || *a == '\\') ) { + *a = '_'; + } + } + } /* ensure that there is no ../ (potential vulnerability) */ fil_simplifie(save); @@ -1148,7 +1248,7 @@ int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_a // chemin primaire éventuel A METTRE AVANT if (strnotempty(opt->path_html)) { - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; strcpybuff(tempo,opt->path_html); strcatbuff(tempo,save); strcpybuff(save,tempo); @@ -1189,17 +1289,22 @@ printf("%cParse: %d",13,i); #if HTS_CASSE if ((strcmp(liens[i]->adr,adr)==0) && (strcmp(liens[i]->fil,fil_complete)==0)) #else - if ((strfield2(liens[i]->adr,adr)) && (strfield2(liens[i]->fil,fil_complete))) + if ((strfield2(liens[i]->adr, normadr)) && (strfield2(liens[i]->fil, normfil))) + //if ((strfield2(liens[i]->adr,adr)) && (strfield2(liens[i]->fil,fil_complete))) #endif { // ok c'est le même lien, adresse déja définie - //printf("Ok, %s\n",save); - //i=lien_tot; // sortir + /* Take the existing name not to screw up with cAsE sEnSiTiViTy of Linux/Unix */ + if (strcmp(liens[i]->sav, save) != 0) { + strcpybuff(save, liens[i]->sav); + } i=0; #if DEBUG_SAVENAME printf("\nOK ALREADY DEFINED\n",13,i); +#endif +#if HTS_CASSE #endif } else { // utilisé par un AUTRE, changer de nom - char tempo[HTS_URLMAXSIZE*2]; + char BIGSTK tempo[HTS_URLMAXSIZE*2]; char* a=save+strlen(save)-1; char* b; int n=2; @@ -1310,7 +1415,7 @@ char* url_md5(char* fil_complete) { a=strchr(fil_complete,'?'); if (a) { if (strlen(a)) { - char buff[HTS_URLMAXSIZE*2]; + char BIGSTK buff[HTS_URLMAXSIZE*2]; a++; digest[0]=buff[0]='\0'; strcatbuff(buff,a); /* query string MD5 */ diff --git a/src/htsname.h b/src/htsname.h index aae5f99..61ed1de 100644 --- a/src/htsname.h +++ b/src/htsname.h @@ -42,9 +42,12 @@ Please visit our Website: http://www.httrack.com #include "htscore.h" +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE int url_savename(char* adr_complete,char* fil_complete,char* save,char* former_adr,char* former_fil,char* referer_adr,char* referer_fil,httrackp* opt,lien_url** liens,int lien_tot,lien_back* back,int back_max,cache_back* cache,hash_struct* hash,int ptr,int numero_passe); void standard_name(char* b,char* dot_pos,char* nom_pos,char* fil_complete,int short_ver); void url_savename_addstr(char* d,char* s); char* url_md5(char* fil_complete); +#endif #endif diff --git a/src/htsnet.h b/src/htsnet.h index dbdbcc6..7b7cc1a 100644 --- a/src/htsnet.h +++ b/src/htsnet.h @@ -45,7 +45,9 @@ Please visit our Website: http://www.httrack.com #include #if HTS_WIN // pour read +#ifndef _WIN32_WCE #include +#endif // pour FindFirstFile #include #else @@ -71,12 +73,6 @@ Please visit our Website: http://www.httrack.com #ifndef HTS_DO_NOT_REDEFINE_in_addr_t typedef unsigned long in_addr_t; #endif -#undef min -#undef max -#undef Sleep -#define min(a,b) ((a)>(b)?(b):(a)) -#define max(a,b) ((a)>(b)?(a):(b)) -#define Sleep(a) { if (((a)*1000)%1000000) usleep(((a)*1000)%1000000); if (((a)*1000)/1000000) sleep(((a)*1000)/1000000); } #endif /* @@ -87,7 +83,7 @@ Please visit our Website: http://www.httrack.com /* Ipv4 structures */ typedef struct in_addr INaddr; /* This should handle all cases */ -typedef struct { +typedef struct SOCaddr { union { struct sockaddr_in in; struct sockaddr sa; @@ -155,7 +151,7 @@ strcpy(namebuf, dot); \ /* Ipv4 structures */ typedef struct in6_addr INaddr; /* This should handle all cases */ -typedef struct { +typedef struct SOCaddr { union { struct sockaddr_in6 in6; struct sockaddr_in in; @@ -236,7 +232,7 @@ getnameinfo((struct sockaddr *)&(ss), sslen, \ #endif /* Buffer structure to copy various hostent structures */ -typedef struct { +typedef struct t_fullhostent { t_hostent hp; char* list[2]; char addr[HTS_MAXADDRLEN]; /* various struct sockaddr structures */ diff --git a/src/htsnostatic.c b/src/htsnostatic.c index eff6184..22e7d7a 100644 --- a/src/htsnostatic.c +++ b/src/htsnostatic.c @@ -35,13 +35,16 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE + #include "htsnostatic.h" #include "htsbase.h" #include "htshash.h" #include "htsinthash.h" -typedef struct { +typedef struct hts_varhash { /* inthash values; */ diff --git a/src/htsnostatic.h b/src/htsnostatic.h index f24f0ad..3bf4ec9 100644 --- a/src/htsnostatic.h +++ b/src/htsnostatic.h @@ -53,21 +53,12 @@ Please visit our Website: http://www.httrack.com #ifndef HTSNOSTATIC_DEFH #define HTSNOSTATIC_DEFH +/* Library internal definictions */ +#ifdef HTS_INTERNAL_BYTECODE + #include "htscore.h" #include "htsthread.h" -/* -#if USE_PTHREAD -#if HTS_WIN -#undef HTS_REENTRANT -#else -#define HTS_REENTRANT -#endif -#else -#undef HTS_REENTRANT -#endif -*/ - #define HTS_VAR_MAIN_HASH 127 /* @@ -157,7 +148,61 @@ void hts_destroyvar_key(void* adr); &cKey */ -#if HTS_WIN +#ifdef _WIN32 + +#ifdef _WIN32_WCE + +/* Windows CE: static only */ +#define NOSTATIC_XRESERVE(name, type, nelt) do { \ + /*__declspec( thread )*/ static type thValue[nelt]; \ + /* __declspec( thread ) */ int static initValue = 0; \ + name = thValue; \ + if (!initValue) { \ + initValue = 1; \ + memset(&thValue, 0, sizeof(thValue)); \ + } \ +} while(0) + +#elif 1 + +/* New Windows version: TLS */ +/* Suggested by daan at zwif.com to be more gentle with LoadLibrary (04/2004) +See http://msdn.microsoft.com/library/en-us/vccore/html/_core_rules_and_limitations_for_tls.asp +And especially the "DLL declares any nonlocal data or object as __declspec( thread )" section +*/ +#define NOSTATIC_XRESERVE(name,type,nelt) do { \ + static DWORD tlsIndex = 0; \ + static int initValue = 0; \ + if (initValue == 0) \ + { \ + if (!hts_maylockvar()) { \ + abortLog("unable to lock mutex (not initialized?!)"); \ + abort(); \ + } \ + hts_lockvar(); \ + if (initValue == 0) { \ + tlsIndex = TlsAlloc(); \ + if (tlsIndex == 0xFFFFFFFF) { \ + abortLog("unable to allocate thread local storage (TLS) for variable!"); \ + abort(); \ + } \ + initValue = 1; \ + } \ + hts_unlockvar(); \ + } \ + name = (type*)TlsGetValue(tlsIndex); \ + if (name == NULL) { \ + name = (type*)malloc(sizeof(type)*nelt); \ + if (name == NULL) { \ + abortLog("unable to allocate memory for variable!"); \ + abort(); \ + } \ + memset(name, 0, sizeof(type)*nelt); \ + TlsSetValue(tlsIndex, name); \ + } \ +} while(0) + +#else /* Windows: handled by the compiler */ #define NOSTATIC_XRESERVE(name, type, nelt) do { \ @@ -170,6 +215,8 @@ void hts_destroyvar_key(void* adr); } \ } while(0) +#endif + #else /* Un*x : slightly more complex, we have to create a thread-key */ @@ -227,3 +274,5 @@ else { \ #endif #endif + +#endif diff --git a/src/htsopt.h b/src/htsopt.h index 77910b6..3328ce0 100644 --- a/src/htsopt.h +++ b/src/htsopt.h @@ -44,7 +44,7 @@ Please visit our Website: http://www.httrack.com #include "htsbauth.h" // structure proxy -typedef struct { +typedef struct t_proxy { int active; char name[1024]; int port; @@ -52,14 +52,24 @@ typedef struct { } t_proxy; /* Structure utile pour copier en bloc les paramètres */ -typedef struct { +typedef struct htsfilters { char*** filters; int* filptr; //int* filter_max; } htsfilters; +/* User callbacks chain */ +typedef int (*htscallbacksfncptr)(void); +typedef struct htscallbacks htscallbacks; +struct htscallbacks { + char callbackName[128]; + void* moduleHandle; + htscallbacksfncptr exitFnc; + htscallbacks * next; +}; + /* Structure état du miroir */ -typedef struct { +typedef struct htsoptstate { int stop; int exit_xh; int back_add_stats; @@ -67,11 +77,13 @@ typedef struct { int mimehtml_created; char mimemid[256]; FILE* mimefp; + /* */ + htscallbacks callbacks; } htsoptstate; // paramètres httrack (options) -typedef struct { +typedef struct httrackp { int wizard; // wizard aucun/grand/petit int flush; // fflush sur les fichiers log int travel; // type de déplacements (same domain etc) @@ -96,7 +108,7 @@ typedef struct { int rateout; // nombre d'octets minium pour le transfert int maxtime; // temps max en secondes int maxrate; // taux de transfert max - int maxconn; // nombre max de connexions/s + float maxconn; // nombre max de connexions/s int waittime; // démarrage programmé int cache; // génération d'un cache //int aff_progress; // barre de progression @@ -108,6 +120,8 @@ typedef struct { int mimehtml; // MIME-html int user_agent_send; // user agent (ex: httrack/1.0 [sun]) char user_agent[128]; + char referer[256]; // referer + char from[256]; // from char path_log[1024]; // chemin pour cache et log char path_html[1024]; // chemin pour miroir char path_bin[1024]; // chemin pour templates @@ -135,6 +149,7 @@ typedef struct { int urlhack; // force "url normalization" to avoid loops int tolerant; // accepter content-length incorrect int parseall; // essayer de tout parser (tags inconnus contenant des liens, par exemple) + int parsedebug; // débugger parser (debug!) int norecatch; // ne pas reprendre les fichiers effacés localement par l'utilisateur int verbosedisplay; // animation textuelle char footer[256]; // ligne d'infos @@ -156,6 +171,7 @@ typedef struct { // int quiet; // poser des questions autres que wizard? int keyboard; // vérifier stdin + int bypass_limits; // bypass built-in limits // int is_update; // c'est une update (afficher "File updated...") int dir_topindex; // reconstruire top index par la suite @@ -164,7 +180,7 @@ typedef struct { } httrackp; // stats for httrack -typedef struct { +typedef struct hts_stat_struct { LLint HTS_TOTAL_RECV; // flux entrant reçu LLint stat_bytes; // octets écrits sur disque // int HTS_TOTAL_RECV_STATE; // status: 0 tout va bien 1: ralentir un peu 2: ralentir 3: beaucoup @@ -193,6 +209,9 @@ typedef struct { LLint nb; // données transférées actuellement (estimation) // LLint rate; + // + TStamp last_connect; // last connect() call + TStamp last_request; // last request issued } hts_stat_struct; diff --git a/src/htsparse.c b/src/htsparse.c index 3d35252..79cc1cc 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -37,12 +37,12 @@ Please visit our Website: http://www.httrack.com /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE -#include -#include -#include -#include +#ifndef _WIN32_WCE #include +#endif #include /* File defs */ @@ -92,7 +92,7 @@ Please visit our Website: http://www.httrack.com abortLogFmt("not enough memory for current html document in HT_ADD_CHK : realloct(%d) failed" _ ht_size); \ exit(1); \ } \ - } \ +} \ ht_len+=A; #define HT_ADD_ADR \ if ((opt->getmode & 1) && (ptr>0)) { \ @@ -103,11 +103,35 @@ Please visit our Website: http://www.httrack.com } #define HT_ADD(A) \ if ((opt->getmode & 1) && (ptr>0)) { \ - int i=strlen(A),j=ht_len; \ - if (i) { \ - HT_ADD_CHK(i) \ - memcpy(ht_buff+j, A, i); \ - ht_buff[j+i]='\0'; \ + int i_=strlen(A),j_=ht_len; \ + if (i_) { \ + HT_ADD_CHK(i_) \ + memcpy(ht_buff+j_, A, i_); \ + ht_buff[j_+i_]='\0'; \ + } } +#define HT_ADD_HTMLESCAPED(A) \ + if ((opt->getmode & 1) && (ptr>0)) { \ + int i_, j_; \ + char BIGSTK tempo_[HTS_URLMAXSIZE*2]; \ + escape_for_html_print(A, tempo_); \ + i_=strlen(tempo_); \ + j_=ht_len; \ + if (i_) { \ + HT_ADD_CHK(i_) \ + memcpy(ht_buff+j_, tempo_, i_); \ + ht_buff[j_+i_]='\0'; \ + } } +#define HT_ADD_HTMLESCAPED_FULL(A) \ + if ((opt->getmode & 1) && (ptr>0)) { \ + int i_, j_; \ + char BIGSTK tempo_[HTS_URLMAXSIZE*2]; \ + escape_for_html_print_full(A, tempo_); \ + i_=strlen(tempo_); \ + j_=ht_len; \ + if (i_) { \ + HT_ADD_CHK(i_) \ + memcpy(ht_buff+j_, tempo_, i_); \ + ht_buff[j_+i_]='\0'; \ } } #define HT_ADD_START \ int ht_size=(int)(r->size*5)/4+REALLOC_SIZE; \ @@ -126,12 +150,11 @@ Please visit our Website: http://www.httrack.com #define HT_ADD_END { \ int ok=0;\ if (ht_buff) { \ - INTsys file_len=(INTsys) strlen(ht_buff);\ char digest[32+2];\ digest[0]='\0';\ - domd5mem(ht_buff,file_len,digest,1);\ - if (fsize(fconv(savename))==file_len) { \ - int mlen;\ + domd5mem(ht_buff,ht_len,digest,1);\ + if (fsize(fconv(savename))==ht_len) { \ + int mlen = 0;\ char* mbuff;\ cache_readdata(cache,"//[HTML-MD5]//",savename,&mbuff,&mlen);\ if (mlen) mbuff[mlen]='\0';\ @@ -148,8 +171,8 @@ Please visit our Website: http://www.httrack.com if (!ok) { \ fp=filecreate(savename); \ if (fp) { \ - if (file_len>0) {\ - if ((INTsys)fwrite(ht_buff,1,file_len,fp) != file_len) { \ + if (ht_len>0) {\ + if ((INTsys)fwrite(ht_buff,1,ht_len,fp) != ht_len) { \ int fcheck;\ if ((fcheck=check_fatal_io_errno())) {\ opt->state.exit_xh=-1;\ @@ -186,32 +209,32 @@ Please visit our Website: http://www.httrack.com filenote(savename,NULL); \ }\ if (cache->ndx)\ - cache_writedata(cache->ndx,cache->dat,"//[HTML-MD5]//",savename,digest,(int)strlen(digest));\ + cache_writedata(cache->ndx,cache->dat,"//[HTML-MD5]//",savename,digest,(int)strlen(digest));\ } \ freet(ht_buff); ht_buff=NULL; \ - } +} #define HT_ADD_FOP // COPY IN HTSCORE.C #define HT_INDEX_END do { \ -if (!makeindex_done) { \ -if (makeindex_fp) { \ - char tempo[1024]; \ + if (!makeindex_done) { \ + if (makeindex_fp) { \ + char BIGSTK tempo[1024]; \ if (makeindex_links == 1) { \ - sprintf(tempo,""CRLF,makeindex_firstlink); \ + sprintf(tempo,""CRLF,makeindex_firstlink); \ } else \ - tempo[0]='\0'; \ + tempo[0]='\0'; \ fprintf(makeindex_fp,template_footer, \ - "", \ - tempo \ - ); \ + "", \ + tempo \ + ); \ fflush(makeindex_fp); \ fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \ makeindex_fp=NULL; \ usercommand(opt,0,NULL,fconcat(opt->path_html,"index.html"),"primary","primary"); \ -} \ -} \ -makeindex_done=1; /* ok c'est fait */ \ + } \ + } \ + makeindex_done=1; /* ok c'est fait */ \ } while(0) // Enregistrement d'un lien: @@ -228,50 +251,50 @@ makeindex_done=1; /* ok c'est fait */ \ // COPIE DE HTSCORE.C #define liens_record(A,F,S,FA,FF) { \ -int notecode=0; \ -int lienurl_len=((sizeof(lien_url)+HTS_ALIGN-1)/HTS_ALIGN)*HTS_ALIGN,\ + int notecode=0; \ + int lienurl_len=((sizeof(lien_url)+HTS_ALIGN-1)/HTS_ALIGN)*HTS_ALIGN,\ adr_len=strlen(A),\ fil_len=strlen(F),\ sav_len=strlen(S),\ cod_len=0,\ former_adr_len=strlen(FA),\ former_fil_len=strlen(FF); \ -if (former_adr_len>0) {\ + if (former_adr_len>0) {\ former_adr_len=(former_adr_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \ former_fil_len=(former_fil_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \ -} else former_adr_len=former_fil_len=0;\ -if (strlen(F)>6) if (strnotempty(codebase)) if (strfield(F+strlen(F)-6,".class")) { notecode=1; \ -cod_len=strlen(codebase); cod_len=(cod_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; } \ -adr_len=(adr_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; fil_len=(fil_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; sav_len=(sav_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \ -if ((int) lien_size < (int) (adr_len+fil_len+sav_len+cod_len+former_adr_len+former_fil_len+lienurl_len)) { \ -lien_buffer=(char*) ((void*) calloct(add_tab_alloc,1)); \ -lien_size=add_tab_alloc; \ -if (lien_buffer!=NULL) { \ -liens[lien_tot]=(lien_url*) (void*) lien_buffer; lien_buffer+=lienurl_len; lien_size-=lienurl_len; \ -liens[lien_tot]->firstblock=1; \ -} \ -} else { \ -liens[lien_tot]=(lien_url*) (void*) lien_buffer; lien_buffer+=lienurl_len; lien_size-=lienurl_len; \ -liens[lien_tot]->firstblock=0; \ -} \ -if (liens[lien_tot]!=NULL) { \ -liens[lien_tot]->adr=lien_buffer; lien_buffer+=adr_len; lien_size-=adr_len; \ -liens[lien_tot]->fil=lien_buffer; lien_buffer+=fil_len; lien_size-=fil_len; \ -liens[lien_tot]->sav=lien_buffer; lien_buffer+=sav_len; lien_size-=sav_len; \ -liens[lien_tot]->cod=NULL; \ -if (notecode) { liens[lien_tot]->cod=lien_buffer; lien_buffer+=cod_len; lien_size-=cod_len; strcpybuff(liens[lien_tot]->cod,codebase); } \ -if (former_adr_len>0) {\ -liens[lien_tot]->former_adr=lien_buffer; lien_buffer+=former_adr_len; lien_size-=former_adr_len; \ -liens[lien_tot]->former_fil=lien_buffer; lien_buffer+=former_fil_len; lien_size-=former_fil_len; \ -strcpybuff(liens[lien_tot]->former_adr,FA); \ -strcpybuff(liens[lien_tot]->former_fil,FF); \ -}\ -strcpybuff(liens[lien_tot]->adr,A); \ -strcpybuff(liens[lien_tot]->fil,F); \ -strcpybuff(liens[lien_tot]->sav,S); \ -liens_record_sav_len(liens[lien_tot]); \ -hash_write(hashptr,lien_tot,opt->urlhack); \ -} \ + } else former_adr_len=former_fil_len=0;\ + if (strlen(F)>6) if (strnotempty(codebase)) if (strfield(F+strlen(F)-6,".class")) { notecode=1; \ + cod_len=strlen(codebase); cod_len=(cod_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; } \ + adr_len=(adr_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; fil_len=(fil_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; sav_len=(sav_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \ + if ((int) lien_size < (int) (adr_len+fil_len+sav_len+cod_len+former_adr_len+former_fil_len+lienurl_len)) { \ + lien_buffer=(char*) ((void*) calloct(add_tab_alloc,1)); \ + lien_size=add_tab_alloc; \ + if (lien_buffer!=NULL) { \ + liens[lien_tot]=(lien_url*) (void*) lien_buffer; lien_buffer+=lienurl_len; lien_size-=lienurl_len; \ + liens[lien_tot]->firstblock=1; \ + } \ + } else { \ + liens[lien_tot]=(lien_url*) (void*) lien_buffer; lien_buffer+=lienurl_len; lien_size-=lienurl_len; \ + liens[lien_tot]->firstblock=0; \ + } \ + if (liens[lien_tot]!=NULL) { \ + liens[lien_tot]->adr=lien_buffer; lien_buffer+=adr_len; lien_size-=adr_len; \ + liens[lien_tot]->fil=lien_buffer; lien_buffer+=fil_len; lien_size-=fil_len; \ + liens[lien_tot]->sav=lien_buffer; lien_buffer+=sav_len; lien_size-=sav_len; \ + liens[lien_tot]->cod=NULL; \ + if (notecode) { liens[lien_tot]->cod=lien_buffer; lien_buffer+=cod_len; lien_size-=cod_len; strcpybuff(liens[lien_tot]->cod,codebase); } \ + if (former_adr_len>0) {\ + liens[lien_tot]->former_adr=lien_buffer; lien_buffer+=former_adr_len; lien_size-=former_adr_len; \ + liens[lien_tot]->former_fil=lien_buffer; lien_buffer+=former_fil_len; lien_size-=former_fil_len; \ + strcpybuff(liens[lien_tot]->former_adr,FA); \ + strcpybuff(liens[lien_tot]->former_fil,FF); \ + }\ + strcpybuff(liens[lien_tot]->adr,A); \ + strcpybuff(liens[lien_tot]->fil,F); \ + strcpybuff(liens[lien_tot]->sav,S); \ + liens_record_sav_len(liens[lien_tot]); \ + hash_write(hashptr,lien_tot,opt->urlhack); \ + } \ } #define ENGINE_LOAD_CONTEXT() \ @@ -314,32 +337,67 @@ hash_write(hashptr,lien_tot,opt->urlhack); \ #define ENGINE_SAVE_CONTEXT() \ /* Apply changes */ \ - * ( (int*) (str->lien_tot_) ) = lien_tot; \ - * ( (int*) (str->ptr_) ) = ptr; \ - * ( (int*) (str->lien_size_) ) = lien_size; \ - * ( (char**) (str->lien_buffer_) ) = lien_buffer; \ - /* */ \ - * stre->error_ = error; \ - * stre->store_errpage_ = store_errpage; \ - * stre->lien_max_ = lien_max; \ - /* */ \ - *stre->makeindex_done_ = makeindex_done; \ - *stre->makeindex_fp_ = makeindex_fp; \ - *stre->makeindex_links_ = makeindex_links; \ - /* */ \ - *stre->stat_fragment_ = stat_fragment + * ( (int*) (str->lien_tot_) ) = lien_tot; \ + * ( (int*) (str->ptr_) ) = ptr; \ + * ( (int*) (str->lien_size_) ) = lien_size; \ + * ( (char**) (str->lien_buffer_) ) = lien_buffer; \ + /* */ \ + * stre->error_ = error; \ + * stre->store_errpage_ = store_errpage; \ + * stre->lien_max_ = lien_max; \ + /* */ \ + *stre->makeindex_done_ = makeindex_done; \ + *stre->makeindex_fp_ = makeindex_fp; \ + *stre->makeindex_links_ = makeindex_links; \ + /* */ \ + *stre->stat_fragment_ = stat_fragment #define _FILTERS (*opt->filters.filters) #define _FILTERS_PTR (opt->filters.filptr) #define _ROBOTS ((robots_wizard*)opt->robotsptr) +/* Apply current *adr character for the script automate */ +#define AUTOMATE_LOOKUP_CURRENT_ADR() do { \ + if (inscript) { \ + int new_state_pos; \ + new_state_pos=inscript_state[inscript_state_pos][(unsigned char)*adr]; \ + if (new_state_pos < 0) { \ + new_state_pos=inscript_state[inscript_state_pos][INSCRIPT_DEFAULT]; \ + } \ + assertf(new_state_pos >= 0); \ + assertf(new_state_pos*sizeof(inscript_state[0]) < sizeof(inscript_state)); \ + inscript_state_pos=new_state_pos; \ + } \ +} while(0) + +/* Increment current pointer to 'steps' characters, modifying automate if necessary */ +#define INCREMENT_CURRENT_ADR(steps) do { \ + int steps__ = (steps); \ + while(steps__ > 0) { \ + adr++; \ + AUTOMATE_LOOKUP_CURRENT_ADR(); \ + steps__ --; \ + } \ +} while(0) + /* Main parser */ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { /* Load engine variables */ ENGINE_LOAD_CONTEXT(); - + #if HTS_ANALYSTE + { + char* cAddr = r->adr; + int cSize = (int) r->size; + if ( (opt->debug>0) && (opt->log!=NULL) ) { + fspc(opt->log,"info"); fprintf(opt->log,"engine: preprocess-html: %s%s"LF, urladr, urlfil); + } + if (hts_htmlcheck_preprocess(&cAddr, &cSize, urladr, urlfil) == 1) { + r->adr = cAddr; + r->size = cSize; + } + } if (hts_htmlcheck(r->adr,(int)r->size,urladr,urlfil)) { #endif FILE* fp=NULL; // fichier écrit localement @@ -348,8 +406,8 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { if ( (opt->debug>1) && (opt->log!=NULL) ) { fspc(opt->log,"debug"); fprintf(opt->log,"scan file.."LF); test_flush; } - - + + // Indexing! #if HTS_MAKE_KEYWORD_INDEX if (opt->kindex) { @@ -364,13 +422,13 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { } } #endif - + // Now, parsing if ((opt->getmode & 1) && (ptr>0)) { // récupérer les html sur disque // créer le fichier html local HT_ADD_FOP; // écrire peu à peu le fichier } - + if (!error) { int detect_title=0; // détection du title int back_add_stats = opt->state.back_add_stats; @@ -410,10 +468,11 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { char* intag_start=adr; char* intag_startattr=NULL; int intag_start_valid=0; + int intag_ctype=0; // int parent_relative=0; // the parent is the base path (.js, .css..) HT_ADD_START; // débuter - + /* Initialize script automate for comments, quotes.. */ memset(inscript_state, 0xff, sizeof(inscript_state)); inscript_state[INSCRIPT_START][INSCRIPT_DEFAULT]=INSCRIPT_START; /* by default, stay in START */ @@ -444,12 +503,12 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { /* statistics */ if ((opt->getmode & 1) && (ptr>0)) { - /* - HTS_STAT.stat_files++; - HTS_STAT.stat_bytes+=r->size; + /* + HTS_STAT.stat_files++; + HTS_STAT.stat_bytes+=r->size; */ } - + /* Primary list or URLs */ if (ptr == 0) { intag=1; @@ -457,28 +516,46 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { } /* Check is the file is a .js file */ else if ( - (strfield2(r->contenttype,"application/x-javascript")!=0) - || (strfield2(r->contenttype,"text/css")!=0) + (compare_mime(r->contenttype, str->url_file, "application/x-javascript")!=0) + || (compare_mime(r->contenttype, str->url_file, "text/css")!=0) ) { /* JavaScript js file */ - inscript=1; - inscript_name="script"; - intag=1; // because après