# DIRECTIVES COMMON to HTTP and FILESYSTEM METHODS

###################################################
# WINDOWS USERS NOTE:
# Specify ALL files and directory paths in the
# config file using the forward slash, as
# in /thisdirectory.
#
###################################################

IndexDir /w2/s/www.socalsail.com/html/cyberfair
# For the FileSystem Method:
# This is a space-separated list of files and
# directories you want indexed. You can specify
# more than one of these directives.
#
# For the HTTP Method:
# Use the URLs from which you want the spidering
# to begin.
# NOTE: use html files rather than directories
# for this method.

IndexFile /w2/s/www.socalsail.com/html/search/scs_index
# This is what the generated index file will be.

IndexName "SoCalSail search index"
IndexDescription "This is an index of the SoCalSail site."
IndexPointer "http://www.socalsail.com/search/"
IndexAdmin "John Callender (jbc@west.net)"
# Extra information you can include in the index file.

MetaNames
# List of all the meta names used in the files to index; must be on one line.
# If there are no metanames, DO NOT delete the line.

IndexReport 0
# This is how detailed you want reporting. You can specify numbers
# 0 to 3 - 0 is totally silent, 3 is the most verbose.

FollowSymLinks yes
# Put "yes" to follow symbolic links in indexing, else "no".

#UseStemming no
# Put yes to apply word stemming algorithm during indexing,
# else no. See the manual for info about stemming. Default is
# no.

#PropertyNames author
# List of meta tag names that can be retrieved with the -p option.
# Index size increases according to the formula in the manual.
# Comment out if no PropertyNames. Case insensitive.

IgnoreTotalWordCountWhenRanking yes
# Put yes to ignore the total number of words in the file
# when calculating ranking. Often better with merges and
# small files. Default is no.

ReplaceRules replace "/w2/s/www.socalsail.com/html/" "/"
#ReplaceRules remove "ghill/"
#ReplaceRules replace "[a-z_0-9]*_m.*\.html" "index.html"
#ReplaceRules replace "/ghill" "moreghillmore"
# ReplaceRules allow you to make changes to file pathnames
# before they're indexed. This directive uses C library
# regex.h regular expressions.
# NOTE: do not use replace with an empty replacement ("") to remove
# a string; use remove instead - you might get a core dump otherwise.

#MinWordLimit 5
# Set the minimum length of an indexable word. Every shorter word
# will not be indexed.
# Commenting out the line will give the defaults

#MaxWordLimit 5
# Set the maximum length of an indexable word. Every longer word
# will not be indexed.
# Commenting out the line will give the defaults

#WordCharacters abcdefghijklmnopqrstuvwxyz\&#;0123456789.@|,-'"[](~!@$%^{}_+?
# WORDCHARS is a string of characters which SWISH permits to
# be in words. Any strings which do not include these characters
# will not be indexed. You can choose from any character in
# the following string:
#
# abcdefghijklmnopqrstuvwxyz0123456789_\|/-+=?!@$%^'"`~,.[]{}()
#
# Note that if you omit "0123456789&#;" you will not be able to
# index HTML entities. DO NOT use the asterisk (*), lesser than
# and greater than signs (<), (>), or colon (:).
#
# Including any of these four characters may cause funny things to happen.
# NOTE: Do not escape \ nor " and they cannot be the first letter in the string
# Commenting out the line will give the defaults

#BeginCharacters m"
# Of the characters that you decide can go into words, this is
# a list of characters that words can begin with. It should be
# a subset of (or equal to) WordCharacters
# Same rule of syntax as for WordCharacters

#EndCharacters \"\
# Of the characters that you decide can go into words, this is
# a list of characters that words can end with. It should be
# a subset of (or equal to) WordCharacters
# Same rule of syntax as for WordCharacters

# IgnoreLastChar
# Array that contains the chars that, if considered valid in the middle of
# a word, need to be disregarded when at the end. It is important to also
# set the given chars in the ENDCHARS array, otherwise the word will not
# be indexed because it is considered invalid.
# Commenting out the line will give the defaults
# NOTE: if " is the first char in the string it needs to be escaped with \
# Do not escape otherwise

# IgnoreFirstChar
# Array that contains the chars that, if considered valid in the middle of
# a word, need to be disregarded when at the beginning. This was to solve
# the problem of parentheses when there is no space between ( and the
# beginning of the word.
# Remember to add the chars to the BEGINCHARS list also.
# Commenting out the line will give the defaults
# NOTE: if " is the first char in the string it needs to be escaped with \
# Do not escape otherwise

IgnoreLimit 50 1000
# This automatically omits words that appear too often in the files
# (these words are called stopwords). Specify a whole percentage
# and a number, such as "80 256". This omits words that occur in
# over 80% of the files and appear in over 256 files. Comment out
# to turn off auto-stopwording.

#IgnoreWords SwishDefault
# The IgnoreWords option allows you to specify words to ignore.
# Comment out for no stopwords; the word "SwishDefault" will
# include a list of default stopwords. Words should be separated by spaces
# and may span multiple directives.

IndexComments 0
# This option lets the user decide whether to index the comments in the
# files. Default is 1. Set to 0 if comment indexing is not required.

##################################
# DIRECTIVES for FILESYSTEMS ONLY
# Comment out if using HTTP
###################################

IndexOnly .html
# Only files with these suffixes will be indexed.

# NoContents .gif .xbm .au .mov .mpg .pdf .ps
# Files with these suffixes will not have their contents indexed -
# only their file names will be indexed.

#FileRules pathname contains .*dir1
FileRules filename contains # % ~ .bak .orig .old old.
#FileRules title contains construction example pointers
FileRules directory contains .htaccess
#FileRules filename is index
# Files matching the above criteria will *not* be indexed.
# The pattern matching uses the C library regex.h
# NOTE(review): the "#" in the "filename contains" rule appears to be a
# literal pattern, not a comment marker - confirm the parser only treats
# "#" at the start of a line as a comment.

################################
# DIRECTIVES for HTTP METHOD ONLY
# Comment out if using FILESYSTEM
##################################

# MaxDepth 5
#(default 5) This defines how many links the spider should
#follow before stopping. A value of 0 configures the spider to
#traverse all links

# Delay 60
#(default 60) The number of seconds to wait between issuing
#requests to a server.

# TmpDir /home/ghill/swishRon/
#(default /var/tmp) The location of a writeable temp directory
#on your system. The HTTP access method tells the Perl helper to place
#its files there.

# SpiderDirectory /home/ghill/swishRon/src/
#(default ./) The location of the Perl helper
#script. Remember, if you use a relative directory, it is relative to
#your directory when you run SWISH-E, not to the directory that SWISH-E
#is in.

#EquivalentServer http://library.berkeley.edu http://www.lib.berkeley.edu
#EquivalentServer http://sunsite.berkeley.edu:2000 http://sunsite.berkeley.edu
#(default nothing) This allows you to deal with
#servers that respond to multiple DNS names. Each line should have
#a list of all the method/names that should be considered equivalent.
#If you have multiple directives, each one defines its own set of equivalent
#servers.