From 64cc4a88da8887ef1f7f4d90be0158d2cc76222d Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Mon, 19 Mar 2012 12:57:43 +0000 Subject: httrack 3.40.4 --- html/Makefile.am | 4 +- html/Makefile.in | 29 +- html/faq.html | 10 +- html/filters.html | 261 +++- html/httrack.man.html | 3995 ++++++++++++++++++++++++++----------------------- html/plug.html | 6 + 6 files changed, 2373 insertions(+), 1932 deletions(-) (limited to 'html') diff --git a/html/Makefile.am b/html/Makefile.am index a02e460..416dbf5 100755 --- a/html/Makefile.am +++ b/html/Makefile.am @@ -36,6 +36,8 @@ EXTRA_DIST = $(HelpHtml_DATA) $(HelpHtmlimg_DATA) $(HelpHtmlimages_DATA) \ install-data-hook: if test ! -f $(DESTDIR)$(prefix)/share/httrack/html ; then \ ( cd $(DESTDIR)$(prefix)/share/httrack \ - && $(LN_S) ../doc/httrack/html html \ + && mv -f ../doc/httrack/html html \ + && cd ../doc/httrack/ \ + && $(LN_S) ../../httrack/html html \ ) \ fi diff --git a/html/Makefile.in b/html/Makefile.in index 90a625e..fd5387c 100644 --- a/html/Makefile.in +++ b/html/Makefile.in @@ -1,7 +1,7 @@ -# Makefile.in generated by automake 1.7 from Makefile.am. +# Makefile.in generated by automake 1.7.9 from Makefile.am. # @configure_input@ -# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002 +# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003 # Free Software Foundation, Inc. 
# This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, @@ -40,6 +40,7 @@ ACLOCAL = @ACLOCAL@ AMDEP_FALSE = @AMDEP_FALSE@ AMDEP_TRUE = @AMDEP_TRUE@ AMTAR = @AMTAR@ +AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ @@ -50,6 +51,7 @@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ +CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ @@ -63,6 +65,8 @@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ +F77 = @F77@ +FFLAGS = @FFLAGS@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ @@ -96,8 +100,10 @@ THREADS_LIBS = @THREADS_LIBS@ V6_FLAG = @V6_FLAG@ VERSION = @VERSION@ VERSION_INFO = @VERSION_INFO@ +ac_ct_AR = @ac_ct_AR@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ ac_ct_RANLIB = @ac_ct_RANLIB@ ac_ct_STRIP = @ac_ct_STRIP@ am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ @@ -105,6 +111,7 @@ am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ am__include = @am__include@ +am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ bindir = @bindir@ build = @build@ @@ -169,6 +176,7 @@ EXTRA_DIST = $(HelpHtml_DATA) $(HelpHtmlimg_DATA) $(HelpHtmlimages_DATA) \ httrack.css subdir = html +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs CONFIG_HEADER = $(top_builddir)/config.h CONFIG_CLEAN_FILES = @@ -178,7 +186,7 @@ DATA = $(HelpHtml_DATA) $(HelpHtmlTxt_DATA) $(HelpHtmldiv_DATA) \ $(VFolderEntry_DATA) $(WebGPixmap_DATA) $(WebHtml_DATA) \ $(WebHtmlimages_DATA) $(WebHtmlsfx_DATA) $(WebPixmap_DATA) -DIST_COMMON = Makefile.am Makefile.in +DIST_COMMON = $(srcdir)/Makefile.in Makefile.am all: all-am .SUFFIXES: @@ -427,9 +435,11 @@ distdir = $(top_distdir)/$(PACKAGE)-$(VERSION) distdir: $(DISTFILES) $(mkinstalldirs) 
$(distdir)/div $(distdir)/images $(distdir)/img $(distdir)/server $(distdir)/server/div $(distdir)/server/images $(distdir)/server/sfx @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ list='$(DISTFILES)'; for file in $$list; do \ case $$file in \ $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ esac; \ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ @@ -456,7 +466,6 @@ all-am: Makefile $(DATA) installdirs: $(mkinstalldirs) $(DESTDIR)$(HelpHtmldir) $(DESTDIR)$(HelpHtmlTxtdir) $(DESTDIR)$(HelpHtmldivdir) $(DESTDIR)$(HelpHtmlimagesdir) $(DESTDIR)$(HelpHtmlimgdir) $(DESTDIR)$(HelpHtmlrootdir) $(DESTDIR)$(VFolderEntrydir) $(DESTDIR)$(WebGPixmapdir) $(DESTDIR)$(WebHtmldir) $(DESTDIR)$(WebHtmlimagesdir) $(DESTDIR)$(WebHtmlsfxdir) $(DESTDIR)$(WebPixmapdir) - install: install-am install-exec: install-exec-am install-data: install-data-am @@ -468,7 +477,7 @@ install-am: all-am installcheck: installcheck-am install-strip: $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - INSTALL_STRIP_FLAG=-s \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ `test -z '$(STRIP)' || \ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install mostlyclean-generic: @@ -476,7 +485,7 @@ mostlyclean-generic: clean-generic: distclean-generic: - -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f $(CONFIG_CLEAN_FILES) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @@ -486,7 +495,7 @@ clean: clean-am clean-am: clean-generic clean-libtool mostlyclean-am distclean: distclean-am - + -rm -f Makefile distclean-am: clean-am distclean-generic distclean-libtool dvi: dvi-am @@ -515,7 +524,7 @@ install-man: installcheck-am: maintainer-clean: maintainer-clean-am - + -rm -f Makefile maintainer-clean-am: 
distclean-am maintainer-clean-generic mostlyclean: mostlyclean-am @@ -563,7 +572,9 @@ uninstall-am: uninstall-HelpHtmlDATA uninstall-HelpHtmlTxtDATA \ install-data-hook: if test ! -f $(DESTDIR)$(prefix)/share/httrack/html ; then \ ( cd $(DESTDIR)$(prefix)/share/httrack \ - && $(LN_S) ../doc/httrack/html html \ + && mv -f ../doc/httrack/html html \ + && cd ../doc/httrack/ \ + && $(LN_S) ../../httrack/html html \ ) \ fi # Tell versions [3.59,3.63) of GNU make to not export all variables. diff --git a/html/faq.html b/html/faq.html index c5146a9..9225147 100644 --- a/html/faq.html +++ b/html/faq.html @@ -380,7 +380,15 @@ A: Yes, it does A: WinHTTrack is the Windows release of HTTrack (with a graphic shell)

Q: Is HTTrack Mac compatible?
-A: No, because of a lack of time. But sources are available +A: Yes, using the original sources: + +
+tar xvfz httrack-*.tar.gz
+cd httrack-*
+./configure --prefix=/usr/local && make && make install
+/usr/local/bin/httrack
+
+But there are no official binaries, because of a lack of time


Q: Can HTTrack be compiled on all Un*x?
A: It should. The Makefile may be modified in some cases, however diff --git a/html/filters.html b/html/filters.html index d058296..dac8545 100644 --- a/html/filters.html +++ b/html/filters.html @@ -119,7 +119,10 @@ See also: The FAQ
contrary refuse files of a particular type. That is the purpose of filters.
-
+

+

Scan rules based on URL or extension (e.g. accept or refuse all .zip or .gif files)

+

+ To accept a family of links (for example, all links with a specific name or type), you just have to add an authorization filter, like +*.gif. The pattern is a plus (this one: +), followed by a pattern composed of letters and wildcards (this one: *). @@ -131,23 +134,71 @@ See also: The FAQ
Example: +*.gif will accept all files finished by .gif
Example: -*.gif will refuse all files finished by .gif

+ +

+

Scan rules based on size (e.g. accept or refuse files bigger/smaller than a certain size)

+

+ + Once a link is scheduled for download, you can still refuse it (i.e. abort the download) by checking its + size to ensure that you won't reach a defined limit. + + Example: You may want to accept all files on the domain www.example.com, using '+www.example.com/*', + including gif files inside this domain and outside (external images), but not take too large images, + or too small ones (thumbnails)
+ Excluding gif images smaller than 5KB and images larger than 100KB is therefore a good option; + +www.example.com +*.gif -*.gif*[<5] -*.gif*[>100] + +
+ + Important notice: size scan rules are checked after the link was scheduled for download, + allowing the connection to be aborted. + +

+

Scan rules based on MIME types (e.g. accept or refuse all files of type audio/mp3)

+

+ + Once a link is scheduled for download, you can still refuse it (i.e. abort the download) by matching its MIME + type against certain patterns. + + Example: You may want to accept all files on the domain www.example.com, using '+www.example.com/*', and + exclude all gif files, using '-*.gif'. But some dynamic scripts (such as www.example.com/dynamic.php) can + both generate html content, or image data content, depending on the context. Excluding this script, using + the scan rule '-www.example.com/dynamic.php', is therefore not a good solution. + +
+ The only reliable way in such cases is to exclude the specific mime type 'image/gif', using the scan rule + syntax:
+ -mime:image/gif +
+ + Important notice: MIME types scan rules are only checked against links that were + scheduled for download, i.e. links already authorized by url scan rules. + Hence, using '+mime:image/gif' will only be a hint to accept images that were already authorized, + if previous MIME scan rules excluded them - such as in '-mime:*/* +mime:text/html +mime:image/gif' + +

- Let's talk a little more about patterns: +

Scan rules patterns:

+ +

+

1.a. Scan rules based on URL or extension

+


Filters are analyzed by HTTrack from the first filter to the last one. The complete URL name is compared to filters defined by the user or added automatically by HTTrack.

- A link has an higher priority than the one before it - hierarchy is important:
+ A scan rule has a higher priority if it is declared later - hierarchy is important:

- -
+
+*.gif -image*.gif Will accept all gif files BUT image1.gif,imageblue.gif,imagery.gif and so on
+
-image*.gif +*.gif Will accept all gif files, because the second pattern is prioritary (because it is defined AFTER the first one) @@ -155,6 +206,8 @@ See also: The FAQ

+ Note: these scan rules can be mixed with scan rules based on size (see 1.b)
+
We saw that patterns are composed of letters and wildcards (*), as in */image*.gif @@ -162,47 +215,44 @@ See also: The FAQ
Special wild cards can be used for specific characters: (*[..])

- + - + - + - + - + - + - - + + - - + + - - + + - - + +
** any characters (the most commonly used)
*[file] or *[name]*[file] or *[name] any filename or name, e.g. not /,? and ; characters
*[path]*[path] any path (and filename), e.g. not ? and ; characters
*[a,z,e,r,t,y]*[a,z,e,r,t,y] any letters among a,z,e,r,t,y
*[a-z]*[a-z] any letters
*[0-9,a,z,e,r,t,y]*[0-9,a,z,e,r,t,y] any characters among 0..9 and a,z,e,r,t,y
*[]no characters must be present after*[\*]the * character
*[<NN]the file size must be smaller than NN KB -
(note: this may cause broken files during the download)
*[\\]the \ character
*[>NN]the file size must be greater than NN KB -
(note: this may cause broken files during the download)
*[\[\]]the [ or ] character
*[<NN>MM]the file size must be smaller than NN KB and greater than MM KB -
(note: this may cause broken files during the download)
*[]no characters must be present after
@@ -212,44 +262,195 @@ See also: The FAQ
interface)

- + - + - + - + - + - + - + - +
www.thisweb.com* www.thisweb.com* This will refuse/accept this web site (all links located in it will be rejected)
*.com/**.com/* This will refuse/accept all links that contains .com in them
*cgi-bin* *cgi-bin* This will refuse/accept all links that contains cgi-bin in them
www.*[path].com/*[path].zip www.*[path].com/*[path].zip This will refuse/accept all zip files in .com addresses
*someweb*/*.tar**someweb*/*.tar* This will refuse/accept all tar (or tar.gz etc.) files in hosts containing someweb
*/*somepage**/*somepage* This will refuse/accept all links containing somepage (but not in the address)
*.html*.html This will refuse/accept all html files.
Warning! With this filter you will accept ALL html files, even those in other addresses. (causing a global (!) web mirror..) Use www.someweb.com/*.html to accept all html files from a web.
*.html*[]*.html*[] Identical to *.html, but the link must not have any supplemental characters at the end (links with parameters, like www.someweb.com/index.html?page=10, will be refused)
+

+

1.b. Scan rules based on size

+

+ +
+ Filters are analyzed by HTTrack from the first filter to the last one. The sizes + are compared against scan rules defined by the user.

+ A scan rule has a higher priority if it is declared later - hierarchy is important.
+ + Note: scan rules based on size can be mixed with regular URL patterns
+ +


+ Size patterns:

+ + + + + + + + + + + + + +
*[<NN]the file size must be smaller than NN KB +
(note: this may cause broken files during the download)
*[>NN]the file size must be greater than NN KB +
(note: this may cause broken files during the download)
*[<NN>MM]the file size must be smaller than NN KB and greater than MM KB +
(note: this may cause broken files during the download)
+ +


+ Here are some examples of filters: (that can be generated automatically using the + interface)

+ + + + + + + + + + + + + + + + + +
-*[<10]the file will be forbidden if its size is smaller than 10 KB
-*[>50]the file will be forbidden if its size is greater than 50 KB
-*[<10] -*[>50]the file will be forbidden if its size is smaller than 10 KB or greater than 50 KB
+*[<80>1]the file will be accepted if its size is smaller than 80 KB and greater than 1 KB
+ + +

+

2. Scan rules based on MIME types

+

+ +
+ Filters are analyzed by HTTrack from the first filter to the last one. The complete MIME + type is compared against scan rules defined by the user.

+ A scan rule has a higher priority if it is declared later - hierarchy is important
+ + Note: scan rules based on MIME types can NOT be mixed with regular URL patterns or size patterns within the same rule, but you can use both of them in distinct ones
+ +


+ Here are some examples of filters: (that can be generated automatically using the + interface)

+ + + + + + + + + + + + + + + + + + + + + + + + + +
-mime:application/octet-streamThis will refuse all links of type 'application/octet-stream' that were already scheduled for download + (i.e. the download will be aborted)
-mime:application/*This will refuse all links of type beginning with 'application/' that were already scheduled for download + (i.e. the download will be aborted)
-mime:application/* +mime:application/pdfThis will refuse all links of type beginning with 'application/' that were already scheduled for download, except for 'application/pdf' ones + (i.e. all other 'application/' link download will be aborted)
-mime:video/*This will refuse all video links that were already scheduled for download + (i.e. the video link downloads will be aborted)
-mime:video/* -mime:audio/*This will refuse all audio and video links that were already scheduled for download + (i.e. the audio and video link downloads will be aborted)
-mime:*/* +mime:text/html +mime:image/*This will refuse all links that were already scheduled for download, except html pages, and images + (i.e. all other link download will be aborted). Note that this is a very inefficient way of filtering + files, as aborted downloads will generate useless requests to the server. You are strongly advised to + use additional URL scan rules
+ +

+

2. Scan rules based on URL or size, and scan rules based on MIME types interactions

+

+ + You must use scan rules based on MIME types very carefully, or you will end up with an incomplete + mirror, or create an inefficient download session (generating costly and useless requests to the server) +
+ +


+ Here are some examples of good/bad scan rules interactions:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PurposeMethodResult
Download all html and images on www.example.com-*
+www.example.com/*.html
+www.example.com/*.php
+www.example.com/*.asp
+www.example.com/*.gif
+www.example.com/*.jpg
+www.example.com/*.png
-mime:*/* +mime:text/html +mime:image/*
Good: efficient download
-*
+www.example.com/*
-mime:*/* +mime:text/html +mime:image/*
Bad: many aborted downloads, leading to poor performances and server load
Download only html on www.example.com, plus ZIP files-*
+www.example.com/*.html
+www.example.com/somedynamicscript.php
+www.example.com/*.zip
-mime:* +mime:text/html +mime:application/zip
Good: ZIP files will be downloaded, even those generated by 'somedynamicscript.php'
-*
+www.example.com/*.html
-mime:* +mime:text/html +mime:application/zip
Bad: ZIP files will never be scheduled for download, and hence the zip mime scan rule will never be used
Download all html, and images smaller than 100KB on www.example.com-*
+www.example.com/*.html
+www.example.com/*.php
+www.example.com/*.asp
+www.example.com/*.gif*[<100]
+www.example.com/*.jpg*[<100]
+www.example.com/*.png*[<100]
-mime:*/* +mime:text/html +mime:image/*
Good: efficient download
-*
+www.example.com/**[<100]
-mime:*/* +mime:text/html +mime:image/*
Bad: many aborted downloads, leading to poor performances and server load
+
diff --git a/html/httrack.man.html b/html/httrack.man.html index 33701d1..116cbb5 100644 --- a/html/httrack.man.html +++ b/html/httrack.man.html @@ -1,3 +1,5 @@ + + @@ -23,24 +25,28 @@ SEE ALSO

- -

NAME

+ -
-httrack - offline browser : copy websites to a local directory
+ + +

httrack − offline browser : copy websites to a +local directory

+ +

SYNOPSIS

- + - +
-httrack [ url ]... [ -filter ]... [ +filter ]... [ ] -[ -w, --mirror ] [ -W, --mirror-wizard ] [ + +

httrack [ url ]... [ -filter ]... [ +filter ]... [ +] [ -w, --mirror ] [ -W, --mirror-wizard ] [ -g, --get-files ] [ -i, --continue ] [ -Y, --mirrorlinks ] [ -P, --proxy ] [ -%f, --httpproxy-ftp[=N] ] [ -%b, --bind ] [ -rN, @@ -48,13 +54,14 @@ httrack - offline browser : copy websites to a local directory

--max-files[=N] ] [ -MN, --max-size[=N] ] [ -EN, --max-time[=N] ] [ -AN, --max-rate[=N] ] [ -%cN, --connection-per-second[=N] ] [ -GN, ---max-pause[=N] ] [ -cN, --sockets[=N] ] [ --TN, --timeout ] [ -RN, --retries[=N] ] [ --JN, --min-rate[=N] ] [ -HN, ---host-control[=N] ] [ -%P, +--max-pause[=N] ] [ -%mN, --max-mms-time[=N] ] [ +-cN, --sockets[=N] ] [ -TN, --timeout ] [ +-RN, --retries[=N] ] [ -JN, --min-rate[=N] ] [ +-HN, --host-control[=N] ] [ -%P, --extended-parsing[=N] ] [ -n, --near ] [ -t, --test ] [ -%L, --list ] [ -%S, --urllist -] [ -NN, --structure[=N] ] [ -%M, --mime-html +] [ -NN, --structure[=N] ] [ -%D, +--cached-delayed-type-check ] [ -%M, --mime-html ] [ -LN, --long-names[=N] ] [ -KN, --keep-links[=N] ] [ -x, --replace-external ] [ -%x, --disable-passwords ] [ -%q, @@ -84,1916 +91,1948 @@ httrack - offline browser : copy websites to a local directory --debug-headers ] [ -%!, --disable-security-limits ] [ -V, --userdef-cmd ] [ -%U, --user ] [ -%W, --callback ] [ -K, ---keep-links[=N] ] [ +--keep-links[=N] ] [

+ +

DESCRIPTION

- - - -
-httrack allows you to download a World Wide Web site -from the Internet to a local directory, building recursively -all directories, getting HTML, images, and other files from -the server to your computer. HTTrack arranges the original -site's relative link-structure. Simply open a page of the -"mirrored" website in your browser, and you can -browse the site from link to link, as if you were viewing it -online. HTTrack can also update an existing mirrored site, -and resume interrupted downloads.
+ + + + + +
+

httrack allows you to download a World Wide Web +site from the Internet to a local directory, building +recursively all directories, getting HTML, images, and other +files from the server to your computer. HTTrack arranges the +original site’s relative link-structure. Simply open a +page of the "mirrored" website in your browser, +and you can browse the site from link to link, as if you +were viewing it online. HTTrack can also update an existing +mirrored site, and resume interrupted downloads.

+

EXAMPLES

- + -
-httrack www.someweb.com/bob/
- + + +

httrack www.someweb.com/bob/

+ + -
-mirror site www.someweb.com/bob/ and only this -site
- + + +

mirror site www.someweb.com/bob/ and only this site

+ + + -
-httrack www.someweb.com/bob/ www.anothertest.com/mike/ -+*.com/*.jpg -mime:application/*
- + + +

httrack www.someweb.com/bob/ www.anothertest.com/mike/ ++*.com/*.jpg -mime:application/*

+ + -
-mirror the two sites together (with shared links) and accept -any .jpg files on .com sites
- + + +

mirror the two sites together (with shared links) and +accept any .jpg files on .com sites

+ + + -
-httrack www.someweb.com/bob/bobby.html +* --r6
- + + +

httrack www.someweb.com/bob/bobby.html +* +-r6

+ + - +
-means get all files starting from bobby.html, with 6 + +

means get all files starting from bobby.html, with 6 link-depth, and possibility of going everywhere on the -web

- +web

+ + + -
-httrack www.someweb.com/bob/bobby.html --spider -P -proxy.myhost.com:8080
- + + +

httrack www.someweb.com/bob/bobby.html --spider -P +proxy.myhost.com:8080

+ + -
-runs the spider on www.someweb.com/bob/bobby.html using a -proxy
- + + +

runs the spider on www.someweb.com/bob/bobby.html using a +proxy

+ + + -
-httrack --update
- + + +

httrack --update

+ + -
-updates a mirror in the current folder
- + + +

updates a mirror in the current folder

+ + + -
-httrack
- + + +

httrack

+ + -
-will bring you to the interactive mode
- + + +

will bring you to the interactive mode

+ + + -
-httrack --continue
- + + +

httrack --continue

+ + -
-continues a mirror in the current folder
+ + +

continues a mirror in the current folder

+ +

OPTIONS

- + -
-General options:
- + + +

General options:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
--O
+ + +

-O

+ + + + +

path for mirror/logfiles+cache (-O path mirror[,path +cache and logfiles]) (--path <param>)

+ + + -
-path for mirror/logfiles+cache (-O path mirror[,path cache -and logfiles]) (--path <param>)
- + + +

Action options:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
-Action options:
+ + - +

-w

+ + + -
+ +

*mirror web sites (--mirror)

+
--w
+ + - +

-W

+ + + -
+ +

mirror web sites, semi-automatic (asks questions) +(--mirror-wizard)

+
-*mirror web sites (--mirror)
+ + - +

-g

+ + + -
+ +

just get files (saved in the current directory) +(--get-files)

+
--W
+ + - +

-i

+ + + -
+ +

continue an interrupted mirror using the cache +(--continue)

+
-mirror web sites, semi-automatic (asks questions) -(--mirror-wizard)
+ + + +

-Y

+ + + +

mirror ALL links located in the first level pages +(mirror links) (--mirrorlinks)

+ + + -
--g
- + + +

Proxy options:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
-just get files (saved in the current directory) -(--get-files)
+ + - +

-P

+ + + -
+ +

proxy use (-P proxy:port or -P user:pass@proxy:port) +(--proxy <param>)

+
--i
+ + - +

-%f

+ + + -
+ +

*use proxy for ftp (f0 don t use) +(--httpproxy-ftp[=N])

+
-continue an interrupted mirror using the cache -(--continue)
+ + +

-%b

+ + + + +

use this local hostname to make/send requests (-%b +hostname) (--bind <param>)

+ + + -
--Y
- + + +

Limits options:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
-mirror ALL links located in the first level pages (mirror -links) (--mirrorlinks)
+ + - +

-rN

+ + + -
+ +

set the mirror depth to N (* r9999) (--depth[=N])

+
-Proxy options:
+ + - +

-%eN

+ + + -
+ +

set the external links depth to N (* %e0) +(--ext-depth[=N])

+
--P
+ + - +

-mN

+ + + -
+ +

maximum file length for a non-html file +(--max-files[=N])

+
-proxy use (-P proxy:port or -P user:pass@proxy:port) -(--proxy <param>)
+ + - +

-mN,N2

+ + + -
+ +

maximum file length for non html (N) and html (N2)

+
--%f
+ + - +

-MN

+ + + -
+ +

maximum overall size that can be uploaded/scanned +(--max-size[=N])

+
-*use proxy for ftp (f0 don t use) -(--httpproxy-ftp[=N])
+ + - +

-EN

+ + + -
+ +

maximum mirror time in seconds (60=1 minute, 3600=1 +hour) (--max-time[=N])

+
--%b
+ + - +

-AN

+ + + -
+ +

maximum transfer rate in bytes/seconds (1000=1KB/s max) +(--max-rate[=N])

+
-use this local hostname to make/send requests (-%b hostname) -(--bind <param>)
+ + - +

-%cN

+ + + -
+ +

maximum number of connections/seconds (*%c10) +(--connection-per-second[=N])

+
-Limits options:
+ + - +

-GN

+ + + -
+ +

pause transfer if N bytes reached, and wait until lock +file is deleted (--max-pause[=N])

+
--rN
+ + +

-%mN

+ + + + +

maximum mms stream download time in seconds (60=1 +minute, 3600=1 hour) (--max-mms-time[=N])

+ + + -
-set the mirror depth to N (* r9999) -(--depth[=N])
- + + +

Flow control:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
--%eN
+ + - +

-cN

+ + + -
+ +

number of multiple connections (*c8) (--sockets[=N])

+
-set the external links depth to N (* %e0) -(--ext-depth[=N])
+ + - +

-TN

+ + + -
+ +

timeout, number of seconds after a non-responding link +is shutdown (--timeout)

+
--mN
+ + - +

-RN

+ + + -
+ +

number of retries, in case of timeout or non-fatal +errors (*R1) (--retries[=N])

+
-maximum file length for a non-html file -(--max-files[=N])
+ + - +

-JN

+ + + -
+ +

traffic jam control, minimum transfert rate +(bytes/seconds) tolerated for a link (--min-rate[=N])

+
--mN,N2
+ + + +

-HN

+ + + +

host is abandonned if: 0=never, 1=timeout, 2=slow, +3=timeout or slow (--host-control[=N])

+ + + -
-maximum file length for non html (N) and html -(N2)
- + + +

Links options:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
--MN
+ + - +

-%P

+ + + -
+ +

*extended parsing, attempt to parse all links, even in +unknown tags or Javascript (%P0 don t use) +(--extended-parsing[=N])

+
-maximum overall size that can be uploaded/scanned -(--max-size[=N])
+ + - +

-n

+ + + -
+ +

get non-html files near an html file (ex: an image +located outside) (--near)

+
--EN
+ + - +

-t

+ + + -
+ +

test all URLs (even forbidden ones) (--test)

+
-maximum mirror time in seconds (60=1 minute, 3600=1 hour) -(--max-time[=N])
+ + - +

-%L

+ + + -
+ +

<file> add all URL located in this text file (one +URL per line) (--list <param>)

+
--AN
+ + + +

-%S

+ + + +

<file> add all scan rules located in this text +file (one scan rule per line) (--urllist <param>)

+ + + -
-maximum transfer rate in bytes/seconds (1000=1KB/s max) -(--max-rate[=N])
- + + +

Build options:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
--%cN
+ + - +

-NN

+ + + -
+ +

structure type (0 *original structure, 1+: see below) +(--structure[=N])

+
-maximum number of connections/seconds (*%c10) -(--connection-per-second[=N])
+ + - +

-or

+ + + -
+ +

user defined structure (-N "%h%p/%n%q.%t")

+
--GN
+ + - +

-%N

+ + + -
+ +

delayed type check, don t make any link test but wait +for files download to start instead (experimental) (%N0 don +t use, %N1 use for unknown extensions, * %N2 always use)

+
-pause transfer if N bytes reached, and wait until lock file -is deleted (--max-pause[=N])
+ + - +

-%D

+ + + -
+ +

cached delayed type check, don t wait for remote type +during updates, to speedup them (%D0 wait, * %D1 don t wait) +(--cached-delayed-type-check)

+
-Flow control:
+ + - +

-%M

+ + + -
+ +

generate a RFC MIME-encapsulated full-archive (.mht) +(--mime-html)

+
--cN
+ + - +

-LN

+ + + -
+ +

long names (L1 *long names / L0 8-3 conversion / L2 +ISO9660 compatible) (--long-names[=N])

+
-number of multiple connections (*c8) -(--sockets[=N])
+ + - +

-KN

+ + + -
+ +

keep original links (e.g. http://www.adr/link) (K0 +*relative link, K absolute links, K4 original links, K3 +absolute URI links) (--keep-links[=N])

+
--TN
+ + - +

-x

+ + + -
+ +

replace external html links by error pages +(--replace-external)

+
-timeout, number of seconds after a non-responding link is -shutdown (--timeout)
+ + - +

-%x

+ + + -
+ +

do not include any password for external password +protected websites (%x0 include) (--disable-passwords)

+
--RN
+ + - +

-%q

+ + + -
+ +

*include query string for local files (useless, for +information purpose only) (%q0 don t include) +(--include-query-string)

+
-number of retries, in case of timeout or non-fatal errors -(*R1) (--retries[=N])
+ + - +

-o

+ + + -
+ +

*generate output html file in case of error (404..) (o0 +don t generate) (--generate-errors)

+
--JN
+ + - +

-X

+ + + -
+ +

*purge old files after update (X0 keep delete) +(--purge-old[=N])

+
-traffic jam control, minimum transfert rate (bytes/seconds) -tolerated for a link (--min-rate[=N])
+ + + +

-%p

+ + + +

preserve html files as is (identical to -K4 -%F +"" ) (--preserve)

+ + + -
--HN
- + + +

Spider options:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
-host is abandonned if: 0=never, 1=timeout, 2=slow, 3=timeout -or slow (--host-control[=N])
+ + - +

-bN

+ + + -
+ +

accept cookies in cookies.txt (0=do not accept,* +1=accept) (--cookies[=N])

+
-Links options:
+ + - +

-u

+ + + -
+ +

check document type if unknown (cgi,asp..) (u0 don t +check, * u1 check but /, u2 check always) +(--check-type[=N])

+
--%P
+ + - +

-j

+ + + -
+ +

*parse Java Classes (j0 don t parse) +(--parse-java[=N])

+
-*extended parsing, attempt to parse all links, even in -unknown tags or Javascript (%P0 don t use) -(--extended-parsing[=N])
+ + - +

-sN

+ + + -
+ +

follow robots.txt and meta robots tags +(0=never,1=sometimes,* 2=always, 3=always (even strict +rules)) (--robots[=N])

+
--n
+ + - +

-%h

+ + + -
+ +

force HTTP/1.0 requests (reduce update features, only +for old servers or proxies) (--http-10)

+
-get non-html files near an html file (ex: an image located -outside) (--near)
+ + - +

-%k

+ + + -
+ +

use keep-alive if possible, greately reducing latency +for small files and test requests (%k0 don t use) +(--keep-alive)

+
--t
+ + - +

-%B

+ + + -
+ +

tolerant requests (accept bogus responses on some +servers, but not standard!) (--tolerant)

+
-test all URLs (even forbidden ones) (--test)
+ + - +

-%s

+ + + -
+ +

update hacks: various hacks to limit re-transfers when +updating (identical size, bogus response..) +(--updatehack)

+
--%L
+ + - +

-%u

+ + + -
+ +

url hacks: various hacks to limit duplicate URLs (strip +//, www.foo.com==foo.com..) (--urlhack)

+
-<file> add all URL located in this text file (one URL -per line) (--list <param>)
+ + - +

-%A

+ + + -
+ +

assume that a type (cgi,asp..) is always linked with a +mime type (-%A php3,cgi=text/html;dat,bin=application/x-zip) +(--assume <param>)

+
--%S
+ + - +

-can

+ + + -
+ +

also be used to force a specific file type: --assume +foo.cgi=text/html

+
-<file> add all scan rules located in this text file -(one scan rule per line) (--urllist -<param>)
+ + +

-@iN

+ + + + +

internet protocol (0=both ipv6+ipv4, 4=ipv4 only, 6=ipv6 +only) (--protocol[=N])

+ + + -
-Build options:
- + + +

Browser ID:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
--NN
+ + - +

-F

+ + + -
+ +

user-agent field sent in HTTP headers (-F +"user-agent name") (--user-agent +<param>)

+
-structure type (0 *original structure, 1+: see below) -(--structure[=N])
+ + - +

-%R

+ + + -
+ +

default referer field sent in HTTP headers (--referer +<param>)

+
--or
+ + - +

-%E

+ + + -
+ +

from email address sent in HTTP headers (--from +<param>)

+
-user defined structure (-N -"%h%p/%n%q.%t")
+ + - +

-%F

+ + + -
+ +

footer string in Html code (-%F "Mirrored [from +host %s [file %s [at %s]]]" (--footer +<param>)

+
--%N
+ + + +

-%l

+ + + +

preffered language (-%l "fr, en, jp, *" +(--language <param>)

+ + + -
-delayed type check, don t make any link test but wait for -files download to start instead (experimental) (%N0 don t -use, %N1 use for unknown extensions, * %N2 always -use)
- + + +

Log, index, cache

+ + + cols="4" cellspacing="0" cellpadding="0"> -
--%M
+ + - +

-C

+ + + -
+ +

create/use a cache for updates and retries (C0 no +cache,C1 cache is prioritary,* C2 test update before) +(--cache[=N])

+
-generate a RFC MIME-encapsulated full-archive (.mht) -(--mime-html)
+ + - +

-k

+ + + -
+ +

store all files in cache (not useful if files on disk) +(--store-all-in-cache)

+
--LN
+ + - +

-%n

+ + + -
+ +

do not re-download locally erased files +(--do-not-recatch)

+
-long names (L1 *long names / L0 8-3 conversion / L2 ISO9660 -compatible) (--long-names[=N])
+ + - +

-%v

+ + + -
+ +

display on screen filenames downloaded (in realtime) - * +%v1 short version - %v2 full animation (--display)

+
--KN
+ + - +

-Q

+ + + -
+ +

no log - quiet mode (--do-not-log)

+
-keep original links (e.g. http://www.adr/link) (K0 *relative -link, K absolute links, K4 original links, K3 absolute URI -links) (--keep-links[=N])
+ + - +

-q

+ + + -
+ +

no questions - quiet mode (--quiet)

+
--x
+ + - +

-z

+ + + -
+ +

log - extra infos (--extra-log)

+
-replace external html links by error pages -(--replace-external)
+ + - +

-Z

+ + + -
+ +

log - debug (--debug-log)

+
--%x
+ + - +

-v

+ + + -
+ +

log on screen (--verbose)

+
-do not include any password for external password protected -websites (%x0 include) (--disable-passwords)
+ + - +

-f

+ + + -
+ +

*log in files (--file-log)

+
--%q
+ + - +

-f2

+ + + -
+ +

one single log file (--single-log)

+
-*include query string for local files (useless, for -information purpose only) (%q0 don t include) -(--include-query-string)
+ + - +

-I

+ + + -
+ +

*make an index (I0 don t make) (--index)

+
--o
+ + - +

-%i

+ + + -
+ +

make a top index for a project folder (* %i0 don t make) +(--build-top-index)

+
-*generate output html file in case of error (404..) (o0 don -t generate) (--generate-errors)
+ + +

-%I

+ + + + +

make an searchable index for this mirror (* %I0 don t +make) (--search-index)

+ + + -
--X
- + + +

Expert options:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
-*purge old files after update (X0 keep delete) -(--purge-old[=N])
+ + - +

-pN

+ + + -
+ +

priority mode: (* p3) (--priority[=N])

+
--%p
+ + - +

-p0

+ + + -
+ +

just scan, don t save anything (for checking links)

+
-preserve html files as is (identical to -K4 -%F "" -) (--preserve)
+ + - +

-p1

+ + + -
+ +

save only html files

+
-Spider options:
+ + - - -
--bN
- - - -
-accept cookies in cookies.txt (0=do not accept,* 1=accept) -(--cookies[=N])
- - - -
--u
- - - -
-check document type if unknown (cgi,asp..) (u0 don t check, -* u1 check but /, u2 check always) -(--check-type[=N])
- - - -
--j
- - - -
-*parse Java Classes (j0 don t parse) -(--parse-java[=N])
- - - -
--sN
- - - -
-follow robots.txt and meta robots tags -(0=never,1=sometimes,* 2=always, 3=always (even strict -rules)) (--robots[=N])
- - - -
--%h
- - - -
-force HTTP/1.0 requests (reduce update features, only for -old servers or proxies) (--http-10)
- - - -
--%k
- - - -
-use keep-alive if possible, greately reducing latency for -small files and test requests (%k0 don t use) -(--keep-alive)
- - - -
--%B
- - - -
-tolerant requests (accept bogus responses on some servers, -but not standard!) (--tolerant)
- - - -
--%s
- - - -
-update hacks: various hacks to limit re-transfers when -updating (identical size, bogus response..) -(--updatehack)
- - - -
--%u
- - - -
-url hacks: various hacks to limit duplicate URLs (strip //, -www.foo.com==foo.com..) (--urlhack)
- - - -
--%A
- - - -
-assume that a type (cgi,asp..) is always linked with a mime -type (-%A php3,cgi=text/html;dat,bin=application/x-zip) -(--assume <param>)
- - - -
--can
- - - -
-also be used to force a specific file type: --assume -foo.cgi=text/html
- - - -
--@iN
- - - -
-internet protocol (0=both ipv6+ipv4, 4=ipv4 only, 6=ipv6 -only) (--protocol[=N])
- - - -
-Browser ID:
- - - -
--F
- - - -
-user-agent field sent in HTTP headers (-F "user-agent -name") (--user-agent <param>)
- - - -
--%R
- - - -
-default referer field sent in HTTP headers (--referer -<param>)
- - - -
--%E
- - - -
-from email address sent in HTTP headers (--from -<param>)
- - - -
--%F
- - - -
-footer string in Html code (-%F "Mirrored [from host %s -[file %s [at %s]]]" (--footer -<param>)
- - - -
--%l
- - - -
-preffered language (-%l "fr, en, jp, *" -(--language <param>)
- - - -
-Log, index, cache
- - - -
--C
- - - -
-create/use a cache for updates and retries (C0 no cache,C1 -cache is prioritary,* C2 test update before) -(--cache[=N])
- - - -
--k
- - - -
-store all files in cache (not useful if files on disk) -(--store-all-in-cache)
- - - -
--%n
- - - -
-do not re-download locally erased files -(--do-not-recatch)
- - - -
--%v
- - - -
-display on screen filenames downloaded (in realtime) - * %v1 -short version - %v2 full animation (--display)
- - - -
--Q
- - - -
-no log - quiet mode (--do-not-log)
- - - -
--q
- - - -
-no questions - quiet mode (--quiet)
- - - -
--z
- - - -
-log - extra infos (--extra-log)
- - - -
--Z
- - - -
-log - debug (--debug-log)
- - - -
--v
- - - -
-log on screen (--verbose)
- - - -
--f
- - - -
-*log in files (--file-log)
- - - -
--f2
- - - -
-one single log file (--single-log)
- - - -
--I
- - - -
-*make an index (I0 don t make) (--index)
- - - -
--%i
- - - -
-make a top index for a project folder (* %i0 don t make) -(--build-top-index)
- - - -
--%I
- - - -
-make an searchable index for this mirror (* %I0 don t make) -(--search-index)
- - - -
-Expert options:
- - - -
--pN
- - - -
-priority mode: (* p3) (--priority[=N])
- - - -
--p0
- - - -
-just scan, don t save anything (for checking -links)
- - - -
--p1
+

-p2

+ + + - - -
-save only html files
- - - -
--p2
- - - -
-save only non html files
- - - -
--*p3
- - - -
-save all files
- - - -
--p7
- - - -
-get html files before, then treat other files
- - - -
--S
- - - -
-stay on the same directory (--stay-on-same-dir)
- - - -
--D
- - - -
-*can only go down into subdirs (--can-go-down)
- - - -
--U
- - - -
-can only go to upper directories (--can-go-up)
- - - -
--B
- - - -
-can both go up&down into the directory structure -(--can-go-up-and-down)
- - - -
--a
- - - -
-*stay on the same address -(--stay-on-same-address)
- - - -
--d
- - - -
-stay on the same principal domain -(--stay-on-same-domain)
- - - -
--l
- - - -
-stay on the same TLD (eg: .com) -(--stay-on-same-tld)
- - - -
--e
- - - -
-go everywhere on the web (--go-everywhere)
- - - -
--%H
- - - -
-debug HTTP headers in logfile (--debug-headers)
- - - -
-Guru options: (do NOT use if possible)
- - - -
--#X
- - - -
-*use optimized engine (limited memory boundary checks) -(--fast-engine)
- - - -
--#0
- - - -
-filter test (-#0 *.gif www.bar.com/foo.gif ) -(--debug-testfilters <param>)
- - - -
--#1
- - - -
-simplify test (-#1 ./foo/bar/../foobar)
- - - -
--#2
- - - -
-type test (-#2 /foo/bar.php)
- - - -
--#C
- - - -
-cache list (-#C *.com/spider*.gif (--debug-cache -<param>)
- - - -
--#R
- - - -
-cache repair (damaged cache) (--debug-oldftp)
- - - -
--#d
- - - -
-debug parser (--debug-parsing)
- - +

save only non html files

+ -
--#E
+ + - - -
-extract new.zip cache meta-data in meta.zip
+

-*p3

+ + + - +

save all files

+ -
--#f
+ + - - -
-always flush log files (--advanced-flushlogs)
+

-p7

+ + + - +

get html files before, then treat other files

+ -
--#FN
+ + - - -
-maximum number of filters -(--advanced-maxfilters[=N])
+

-S

+ + + - +

stay on the same directory (--stay-on-same-dir)

+ -
--#h
+ + - - -
-version info (--version)
+

-D

+ + + - +

*can only go down into subdirs (--can-go-down)

+ -
--#K
+ + - - -
-scan stdin (debug) (--debug-scanstdin)
+

-U

+ + + - +

can only go to upper directories (--can-go-up)

+ -
--#L
+ + - - -
-maximum number of links (-#L1000000) -(--advanced-maxlinks)
+

-B

+ + + - +

can both go up&down into the directory structure +(--can-go-up-and-down)

+ -
--#p
+ + - - -
-display ugly progress information -(--advanced-progressinfo)
+

-a

+ + + - +

*stay on the same address (--stay-on-same-address)

+ -
--#P
+ + - - -
-catch URL (--catch-url)
+

-d

+ + + - +

stay on the same principal domain +(--stay-on-same-domain)

+ -
--#R
+ + - - -
-old FTP routines (debug) (--debug-oldftp)
+

-l

+ + + - +

stay on the same TLD (eg: .com) (--stay-on-same-tld)

+ -
--#T
+ + - - -
-generate transfer ops. log every minutes -(--debug-xfrstats)
+

-e

+ + + - +

go everywhere on the web (--go-everywhere)

+ -
--#u
+ + - - -
-wait time (--advanced-wait)
+

-%H

+ + + +

debug HTTP headers in logfile (--debug-headers)

+ + + -
--#Z
- + + +

Guru options: (do NOT use if possible)

+ + + cols="4" cellspacing="0" cellpadding="0"> -
-generate transfer rate statictics every minutes -(--debug-ratestats)
+ + - - -
--#!
+

-#X

+ + + - +

*use optimized engine (limited memory boundary checks) +(--fast-engine)

+ -
-execute a shell command (-#! "echo hello") (--exec -<param>)
+ + - - -
-Dangerous options: (do NOT use unless you exactly know -what you are doing)
+

-#0

+ + + - +

filter test (-#0 *.gif www.bar.com/foo.gif ) +(--debug-testfilters <param>)

+ -
--%!
+ + - - -
-bypass built-in security limits aimed to avoid bandwith -abuses (bandwidth, simultaneous connections) -(--disable-security-limits)
+

-#1

+ + + - +

simplify test (-#1 ./foo/bar/../foobar)

+ -
--IMPORTANT
+ + - - -
-NOTE: DANGEROUS OPTION, ONLY SUITABLE FOR -EXPERTS
+

-#2

+ + + - +

type test (-#2 /foo/bar.php)

+ -
--USE
+ + - - -
-IT WITH EXTREME CARE
+

-#C

+ + + - +

cache list (-#C *.com/spider*.gif (--debug-cache +<param>)

+ -
-Command-line specific options:
+ + - - -
--V
+

-#R

+ + + - +

cache repair (damaged cache) (--debug-oldftp)

+ -
-execute system command after each files ($0 is the filename: --V "rm ") (--userdef-cmd -<param>)
+ + - - -
--%U
+

-#d

+ + + - +

debug parser (--debug-parsing)

+ -
-run the engine with another id when called as root (-%U -smith) (--user <param>)
+ + - - -
--%W
+

-#E

+ + + - +

extract new.zip cache meta-data in meta.zip

+ -
-use an external library function as a wrapper (-%W -link-detected=foo.so:myfunction[,myparameters]) (--callback -<param>)
+ + - +

-#f

+ + + -
+ +

always flush log files (--advanced-flushlogs)

+
-Details: Option N
+ + - +

-#FN

+ + + -
+ +

maximum number of filters +(--advanced-maxfilters[=N])

+
--N0
+ + - +

-#h

+ + + -
+ +

version info (--version)

+
-Site-structure (default)
+ + - +

-#K

+ + + -
+ +

scan stdin (debug) (--debug-scanstdin)

+
--N1
+ + + +

-#L

+ + + +

maximum number of links (-#L1000000) +(--advanced-maxlinks)

+ + + + cols="4" cellspacing="0" cellpadding="0"> -
-HTML in web/, images/other files in web/images/
+ + - +

-#p

+ + + -
+ +

display ugly progress information +(--advanced-progressinfo)

+
--N2
+ + - +

-#P

+ + + -
+ +

catch URL (--catch-url)

+
-HTML in web/HTML, images/other in web/images
+ + - +

-#R

+ + + -
+ +

old FTP routines (debug) (--debug-oldftp)

+
--N3
+ + - +

-#T

+ + + -
+ +

generate transfer ops. log every minutes +(--debug-xfrstats)

+
-HTML in web/, images/other in web/
+ + - +

-#u

+ + + -
+ +

wait time (--advanced-wait)

+
--N4
+ + - +

-#Z

+ + + -
+ +

generate transfer rate statictics every minutes +(--debug-ratestats)

+
-HTML in web/, images/other in web/xxx, where xxx is the file -extension (all gif will be placed onto web/gif, for -example)
+ + + +

-#!

+ + + +

execute a shell command (-#! "echo hello") +(--exec <param>)

+ + + -
--N5
- + + +

Dangerous options: (do NOT use unless you exactly know +what you are doing)

+ + + cols="4" cellspacing="0" cellpadding="0"> -
-Images/other in web/xxx and HTML in web/HTML
+ + +

-%!

+ + + + +

bypass built-in security limits aimed to avoid bandwith +abuses (bandwidth, simultaneous connections) +(--disable-security-limits)

+ + + -
--N99
- + + +

-IMPORTANT

+ + -
-All files in web/, with random names (gadget !)
- + + +

NOTE: DANGEROUS OPTION, ONLY SUITABLE FOR EXPERTS

+ + + + cols="5" cellspacing="0" cellpadding="0"> -
--N100
+ + + +

-USE

+ + + +

IT WITH EXTREME CARE

+ + + + + -
-Site-structure, without www.domain.xxx/
- + + +

Command-line specific options:

+ + + cols="4" cellspacing="0" cellpadding="0"> -
--N101
+ + - +

-V

+ + + -
+ +

execute system command after each files ($0 is the +filename: -V "rm ") (--userdef-cmd +<param>)

+
-Identical to N1 exept that "web" is replaced by -the site s name
+ + - +

-%U

+ + + -
+ +

run the engine with another id when called as root (-%U +smith) (--user <param>)

+
--N102
+ + +

-%W

+ + + + +

use an external library function as a wrapper (-%W +link-detected=foo.so:myfunction[,myparameters]) (--callback +<param>)

+ + + -
-Identical to N2 exept that "web" is replaced by -the site s name
- + + +

Details: Option N

+ + + cols="4" cellspacing="0" cellpadding="0"> -
--N103
+ + - +

-N0

+ + + -
+ +

Site-structure (default)

+
-Identical to N3 exept that "web" is replaced by -the site s name
+ + - +

-N1

+ + + -
+ +

HTML in web/, images/other files in web/images/

+
--N104
+ + - +

-N2

+ + + -
+ +

HTML in web/HTML, images/other in web/images

+
-Identical to N4 exept that "web" is replaced by -the site s name
+ + - +

-N3

+ + + -
+ +

HTML in web/, images/other in web/

+
--N105
+ + - +

-N4

+ + + -
+ +

HTML in web/, images/other in web/xxx, where xxx is the +file extension (all gif will be placed onto web/gif, for +example)

+
-Identical to N5 exept that "web" is replaced by -the site s name
+ + - +

-N5

+ + + -
+ +

Images/other in web/xxx and HTML in web/HTML

+
--N199
+ + - +

-N99

+ + + -
+ +

All files in web/, with random names (gadget !)

+
-Identical to N99 exept that "web" is replaced by -the site s name
+ + - +

-N100

+ + + -
+ +

Site-structure, without www.domain.xxx/

+
--N1001
+ + - +

-N101

+ + + -
+ +

Identical to N1 exept that "web" is replaced +by the site s name

+
-Identical to N1 exept that there is no "web" -directory
+ + - +

-N102

+ + + -
+ +

Identical to N2 exept that "web" is replaced +by the site s name

+
--N1002
+ + - +

-N103

+ + + -
+ +

Identical to N3 exept that "web" is replaced +by the site s name

+
-Identical to N2 exept that there is no "web" -directory
+ + - +

-N104

+ + + -
+ +

Identical to N4 exept that "web" is replaced +by the site s name

+
--N1003
+ + - +

-N105

+ + + -
+ +

Identical to N5 exept that "web" is replaced +by the site s name

+
-Identical to N3 exept that there is no "web" -directory (option set for g option)
+ + - +

-N199

+ + + -
+ +

Identical to N99 exept that "web" is replaced +by the site s name

+
--N1004
+ + - +

-N1001

+ + + -
+ +

Identical to N1 exept that there is no "web" +directory

+
-Identical to N4 exept that there is no "web" -directory
+ + - +

-N1002

+ + + -
+ +

Identical to N2 exept that there is no "web" +directory

+
--N1005
+ + - +

-N1003

+ + + -
+ +

Identical to N3 exept that there is no "web" +directory (option set for g option)

+
-Identical to N5 exept that there is no "web" -directory
+ + - +

-N1004

+ + + -
+ +

Identical to N4 exept that there is no "web" +directory

+
--N1099
+ + - +

-N1005

+ + + -
+ +

Identical to N5 exept that there is no "web" +directory

+
-Identical to N99 exept that there is no "web" -directory
+ + + +

-N1099

+ + + +

Identical to N99 exept that there is no "web" +directory

+ + + -
-Details: User-defined option N
- + + +

Details: User-defined option N

+ + - +
-%n Name of file without file type (ex: image) %N Name of + +

%n Name of file without file type (ex: image) %N Name of file, including file type (ex: image.gif) %t File type (ex: gif) %p Path [without ending /] (ex: /someimages) %h Host name (ex: www.someweb.com) %M URL MD5 (128 bits, 32 ascii @@ -2001,662 +2040,836 @@ bytes) %Q query string MD5 (128 bits, 32 ascii bytes) %r protocol name (ex: http) %q small query string MD5 (16 bits, 4 ascii bytes) %s? Short name version (ex: %sN) %[param] param variable in query string -%[param:before:after:notfound:empty] advanced variable -extraction

- +%[param:before:after:empty:notfound] advanced variable +extraction

+ + + -
-Details: User-defined option N and advanced variable -extraction
- + + +

Details: User-defined option N and advanced variable +extraction

+ + -
-%[param:before:after:notfound:empty]
- + + +

%[param:before:after:empty:notfound]

+ + + + cols="5" cellspacing="0" cellpadding="0"> -
--param
+ + - - -
-: parameter name
+

-param

+ + + +

: parameter name

+ + + + + -
--before
- + + +

-before

+ + -
-: string to prepend if the parameter was found
- + + +

: string to prepend if the parameter was found

+ + + + cols="5" cellspacing="0" cellpadding="0"> -
--after
+ + - - -
-: string to append if the parameter was found
+

-after

+ + + +

: string to append if the parameter was found

+ + + + + -
--notfound
- + + +

-notfound

+ + -
-: string replacement if the parameter could not be -found
- + + +

: string replacement if the parameter could not be +found

+ + + + cols="4" cellspacing="0" cellpadding="0"> -
--empty
+ + - - -
-: string replacement if the parameter was empty
+

-empty

+ + + - +

: string replacement if the parameter was empty

+ -
--all
+ + - - -
-fields, except the first one (the parameter name), can be -empty
+

-all

+ + + +

fields, except the first one (the parameter name), can +be empty

+ + + -
-Details: Option K
- + + +

Details: Option K

+ + + cols="4" cellspacing="0" cellpadding="0"> -
--K0
+ + - - -
-foo.cgi?q=45 -> foo4B54.html?q=45 (relative URI, -default)
+

-K0

+ + + - +

foo.cgi?q=45 -> foo4B54.html?q=45 (relative URI, +default)

+ -
--K
+ + - - -
--> http://www.foobar.com/folder/foo.cgi?q=45 (absolute -URL) (--keep-links[=N])
+

-K

+ + + - +

-> http://www.foobar.com/folder/foo.cgi?q=45 +(absolute URL) (--keep-links[=N])

+ -
--K4
+ + - - -
--> foo.cgi?q=45 (original URL)
+

-K4

+ + + - +

-> foo.cgi?q=45 (original URL)

+ -
--K3
+ + - - -
--> /folder/foo.cgi?q=45 (absolute URI)
+

-K3

+ + + +

-> /folder/foo.cgi?q=45 (absolute URI)

+ + + -
-Shortcuts:
- + + +

Shortcuts:

+ + -
---mirror
- + + +

--mirror

+ + -
-<URLs> *make a mirror of site(s) -(default)
- + + +

<URLs> *make a mirror of site(s) (default)

+ + + + cols="4" cellspacing="0" cellpadding="0"> -
---get
+ + - +

--get

+ + + -
+ +

<URLs> get the files indicated, do not seek other +URLs (-qg)

+
-<URLs> get the files indicated, do not seek other URLs -(-qg)
+ + +

--list

+ + + + +

<text file> add all URL located in this text file +(-%L)

+ + + -
---list
- + + +

--mirrorlinks

+ + -
-<text file> add all URL located in this text file -(-%L)
- + + +

<URLs> mirror all links in 1st level pages (-Y)

+ + + -
---mirrorlinks
- + + +

--testlinks

+ + -
-<URLs> mirror all links in 1st level pages -(-Y)
- + + +

<URLs> test links in pages (-r1p0C0I0t)

+ + + -
---testlinks
- + + +

--spider

+ + -
-<URLs> test links in pages (-r1p0C0I0t)
- + + +

<URLs> spider site(s), to test links: reports +Errors & Warnings (-p0C0I0t)

+ + + -
---spider
- + + +

--testsite

+ + -
-<URLs> spider site(s), to test links: reports Errors -& Warnings (-p0C0I0t)
- + + +

<URLs> identical to --spider

+ + + -
---testsite
- + + +

--skeleton

+ + -
-<URLs> identical to --spider
- + + +

<URLs> make a mirror, but gets only html files +(-p1)

+ + + -
---skeleton
- + + +

--update

+ + -
-<URLs> make a mirror, but gets only html files -(-p1)
- + + +

update a mirror, without confirmation (-iC2)

+ + + -
---update
- + + +

--continue

+ + -
-update a mirror, without confirmation (-iC2)
- + + +

continue a mirror, without confirmation (-iC1)

+ + + -
---continue
- + + +

--catchurl

+ + -
-continue a mirror, without confirmation (-iC1)
- + + +

create a temporary proxy to capture an URL or a form post +URL

+ + + -
---catchurl
- + + +

--clean

+ + -
-create a temporary proxy to capture an URL or a form post -URL
- + + +

erase cache & log files

+ + + -
---clean
- + + +

--http10

+ + -
-erase cache & log files
- + + +

force http/1.0 requests (-%h)

+ + + -
---http10
- + + +

Details: Option %W: External callbacks +prototypes

+ + -
-force http/1.0 requests (-%h)
- + + +

init : void (* myfunction)(void);

+ + -
-Details: Option %W: External callbacks -prototypes
- + + +

free : void (* myfunction)(void);

+ + -
-init : void (* myfunction)(void);
- + + +

start : int (* myfunction)(httrackp* +opt);

+ + -
-free : void (* myfunction)(void);
- + + +

end : int (* myfunction)(void);

+ + -
-start : int (* myfunction)(httrackp* -opt);
- + + +

change-options : int (* myfunction)(httrackp* +opt);

+ + -
-end : int (* myfunction)(void);
- + + +

preprocess-html : int (* myfunction)(char** html,int* +len,char* url

+ + -
-change-options : int (* myfunction)(httrackp* -opt);
- + + +

adresse,char* url fichier);

+ + + -
-preprocess-html : int (* myfunction)(char** html,int* -len,char* url
- + + +

postprocess-html : int (* myfunction)(char** html,int* +len,char* url

+ + -
-adresse,char* url fichier);
- + + +

adresse,char* url fichier);

+ + + -
-postprocess-html : int (* myfunction)(char** html,int* -len,char* url
- + + +

check-html : int (* myfunction)(char* html,int +len,char* url

+ + -
-adresse,char* url fichier);
- + + +

adresse,char* url fichier);

+ + + -
-check-html : int (* myfunction)(char* html,int len,char* -url
- + + +

query : char* (* myfunction)(char* +question);

+ + -
-adresse,char* url fichier);
- + + +

query2 : char* (* myfunction)(char* +question);

+ + -
-query : char* (* myfunction)(char* -question);
- + + +

query3 : char* (* myfunction)(char* +question);

+ + -
-query2 : char* (* myfunction)(char* -question);
- + + +

loop : int (* myfunction)(lien

+ + -
-query3 : char* (* myfunction)(char* -question);
- + + +

back* back,int back max,int back index,int lien tot,int +lien ntot,int stat time,hts stat struct* stats);

+ + + -
-loop : int (* myfunction)(lien
- + + +

check-link : int (* myfunction)(char* adr,char* +fil,int status);

+ + -
-back* back,int back max,int back index,int lien tot,int lien -ntot,int stat time,hts stat struct* stats);
- + + +

pause : void (* myfunction)(char* +lockfile);

+ + -
-check-link : int (* myfunction)(char* adr,char* fil,int -status);
- + + +

save-file : void (* myfunction)(char* +file);

+ + -
-pause : void (* myfunction)(char* -lockfile);
- + + +

save-file2 : void (* myfunction)(char* hostname,char* +filename,char* localfile,int is

+ + -
-save-file : void (* myfunction)(char* -file);
- + + +

new,int is modified);

+ + + -
-link-detected : int (* myfunction)(char* -link);
- + + +

link-detected : int (* myfunction)(char* +link);

+ + -
-link-detected2 : int (* myfunction)(char* link, char* -start
- + + +

link-detected2 : int (* myfunction)(char* link, char* +start

+ + -
-tag);
- + + +

tag);

+ + + -
-transfer-status : int (* -myfunction)(lien
- + + +

transfer-status : int (* myfunction)(lien

+ + -
-back* back);
- + + +

back* back);

+ + + -
-save-name : int (* myfunction)(char* adr
- + + +

save-name : int (* myfunction)(char* adr

+ + -
-complete,char* fil complete,char* referer adr,char* referer -fil,char* save);
- + + +

complete,char* fil complete,char* referer adr,char* +referer fil,char* save);

+ + + -
-And <wrappername>
- + + +

And <wrappername>

+ + -
-init() functions if defined, called upon plug
+ + +

init() functions if defined, called upon plug

+ +

FILES

- + -
-/etc/httrack.conf
- + + +

/etc/httrack.conf

+ + -
-The system wide configuration file.
+ + +

The system wide configuration file.

+ +

ENVIRONMENT

- + + cols="4" cellspacing="0" cellpadding="0"> -
-HOME
+ + - - -
-Is being used if you defined in /etc/httrack.conf the line -path ~/websites/#
+

HOME

+ + + + +

Is being used if you defined in /etc/httrack.conf the +line path ~/websites/#

+ +

DIAGNOSTICS

- + - +
-Errors/Warnings are reported to hts-log.txt by + +

Errors/Warnings are reported to hts-log.txt by default, or to stderr if the -v option was -specified.

+specified.

+ +

LIMITS

- - - -
-These are the principals limits of HTTrack for that moment. -Note that we did not heard about any other utility that -would have solved them.
- + - +
-- Several scripts generating complex filenames may + +

These are the principals limits of HTTrack for that +moment. Note that we did not heard about any other utility +that would have solved them.

+ +

- Several scripts generating complex filenames may not find them (ex: -img.src='image'+a+Mobj.dst+'.gif')

- - - -
-- Some java classes may not find some files on them -(class included)
- - - -
-- Cgi-bin links may not work properly in some cases -(parameters needed). To avoid them: use filters like --*cgi-bin*
+img.src=’image’+a+Mobj.dst+’.gif’)

+ +

- Some java classes may not find some files on +them (class included)

+ +

- Cgi-bin links may not work properly in some +cases (parameters needed). To avoid them: use filters like +-*cgi-bin*

+ +

BUGS

- + - +
-Please reports bugs to <bugs@httrack.com>. + +

Please reports bugs to <bugs@httrack.com>. Include a complete, self-contained example that will allow the bug to be reproduced, and say which version of httrack you are using. Do not forget to detail options used, OS -version, and any other information you deem -necessary.

+version, and any other information you deem necessary.

+ +

COPYRIGHT

- - - -
-Copyright (C) Xavier Roche and other -contributors
- + - +
-This program is free software; you can redistribute it + +

Copyright (C) Xavier Roche and other contributors

+ +

This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either -version 2 of the License, or any later version.

- - - -
-This program is distributed in the hope that it will be +version 2 of the License, or any later version.

+ +

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -details.

- - - -
-You should have received a copy of the GNU General Public +details.

+ +

You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, -Boston, MA 02111-1307, USA.

+Boston, MA 02111-1307, USA.

+ +

AVAILABILITY

- + -
-The most recent released version of httrack can be found at: -http://www.httrack.com
+ + +

The most recent released version of httrack can be found +at: http://www.httrack.com

+ +

AUTHOR

- + -
-Xavier Roche <roche@httrack.com>
+ + +

Xavier Roche <roche@httrack.com>

+ +

SEE ALSO

- + - +
-The HTML documentation (available online at + +

The HTML documentation (available online at http://www.httrack.com/html/ ) contains more detailed information. Please also refer to the httrack FAQ (available online at -http://www.httrack.com/html/faq.html )

+http://www.httrack.com/html/faq.html )

+ +
diff --git a/html/plug.html b/html/plug.html index 4c99798..42b0895 100755 --- a/html/plug.html +++ b/html/plug.html @@ -146,8 +146,14 @@ Below the list of callbacks, and associated external wrappers:
"query3"Called when the wizard needs to ask a questionchar* (* myfunction)(char* question); "loop"Called periodically (informational, to display statistics)
return value: 1 if the mirror can continue, 0 if the mirror must be abortedint (* myfunction)(lien_back* back,int back_max,int back_index,int lien_tot,int lien_ntot,int stat_time,hts_stat_struct* stats); "check-link"Called when a link has to be tested. The adr and fil are the address and URI of the link being tested. The passed status value has the following meaning: 0 if the link is to be accepted by default, 1 if the link is to be refused by default, and -1 if no decision has yet been taken by the engine
return value: same meaning as the passed status value ; you may generally return -1 to let the engine take the decision by itselfint (* myfunction)(char* adr,char* fil,int status); +"check-mime"Called when a link download has begun, and needs to be tested against its MIME type. The adr and fil are the address and URI of the link being tested, and the mime string contains the link type being processed. The passed status value has the following meaning: 0 if the link is to be accepted by default, 1 if the link is to be refused by default, and -1 if no decision has yet been taken by the engine
return value: same meaning as the passed status value ; you may generally return -1 to let the engine take the decision by itselfint (* myfunction)(char* adr,char* fil,char* mime,int status); "pause"Called when the engine must pause. When the lockfile passed is deleted, the function can return
return value: nonevoid (* myfunction)(char* lockfile); "save-file"Called when a file is to be saved on disk
return value: nonevoid (* myfunction)(char* file); +"save-file2"Called when a file is to be saved or checked on disk
The hostname, filename and local filename are given. Two additional flags tells if the file is new (is_new) and is the file is to be modified (is_modified).
(!is_new && !is_modified): the file is up-to-date, and will not be modified
(is_new && is_modified): a new file will be written (or an updated file is being written)
(!is_new && is_modified): a file is being updated (append)
(is_new && !is_modified): an empty file will be written ("do not recatch locally erased files")
return value: nonevoid (* myfunction)(char* hostname,char* filename,char* localfile,int is_new,int is_modified); + +typedef void (* t_hts_htmlcheck_filesave2)(); + + "link-detected"Called when a link has been detected
return value: 1 if the link can be analyzed, 0 if the link must not even be consideredint (* myfunction)(char* link); "transfer-status"Called when a file has been processed (downloaded, updated, or error)
return value: must return 1int (* myfunction)(lien_back* back); "save-name"Called when a local filename has to be processed. The adr_complete and fil_complete are the address and URI of the file being saved ; the referer_adr and referer_fil are the address and URI of the referer link. The save string contains the local filename being used. You may modifiy the save string to fit your needs, up to 1024 bytes (note: filename collisions, if any, will be handled by the engine by renaming the file into file-2.ext, file-3.ext ..).
return value: must return 1int (* myfunction)(char* adr_complete,char* fil_complete,char* referer_adr,char* referer_fil,char* save); -- cgit v1.2.3