Changesets can be listed by changeset number.
The Git repository is here.
- Revision:
- 269
- Log:
Overdue upgrade to AWStats v6.9, the most recent version
available at the time of writing.
- Author:
- rool
- Date:
- Wed May 27 23:57:15 +0100 2009
- Size:
- 10121 Bytes
- Properties:
- Property svn:executable is set
#!/usr/bin/perl
#-------------------------------------------------------
# Small script to auto-generate URL Alias files for 5.2+ AWStats
# Requires the Perl module below (LWP::UserAgent).
# From original title-grabber.pl file
# (Feedback/suggestions to: simonjw@users.sourceforge.net)
# Modified by eldy@users.sourceforge.net
#
# Note: If you want to retrieve document titles over SSL you must have OpenSSL and
# the Net::SSL(eay) Perl Module available. This code will check that SSL is
# supported before attempting to retrieve via it.
#-------------------------------------------------------
use LWP::UserAgent;

use strict;
use warnings;           # surface misuse (undef concatenation, slice misuse) on STDERR
no strict "refs";


# Extract the numeric revision from the RCS keyword string.
my $REVISION = '$Revision: 1.7 $'; $REVISION =~ /\s(.*)\s/; $REVISION = $1;
my $VERSION  = "1.0 (build $REVISION)";

############### EDIT HERE ###############

# You can set this manually if you will only grep one site.
my $SITECONFIG = "";

# Where the default input is located.
my $awStatsDataDir = "/var/lib/awstats";

# Throttle HTTP requests - helps avoid DoS-like results if on a quick network.
# Number is the number of seconds to pause between requests. Set to zero for
# no throttling.
my $throttleRequestsTime = 0;

# LWP settings
# UA string passed to server. You should add this to SkipUserAgents in the
# awstats.conf file if you want to ignore hits from this code.
my $userAgent = "urlaliasbuilder/$VERSION";
# Put a sensible e-mail address here.
my $spiderOwner = "spider\@mydomain.com";

# Timeout (in seconds) for each HTTP request (increase on slow connections).
my $getTimeOut = 2;
# Proxy server to use when doing http/s - leave blank if you don't have one.
#my $proxyServer = "http://my.proxy.server:port/";
my $proxyServer = "";
# Hosts not to use a proxy for.
my @hostsNoProxy = ("host1","host1.my.domain.name");
# Make sure we don't download multi-megabyte files! We need only the head section.
my $maxDocSizeBytes = 4096; # number is bytes

############### DON'T EDIT BELOW HERE ###############

# Markers delimiting the SIDER (URL list) section of an AWStats history file.
my $FILEMARKER1 = "BEGIN_SIDER";
my $FILEMARKER2 = "END_SIDER";

my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);

# Zero-padded month and 4-digit year, used to locate the current history file.
my $fullMonth = sprintf("%02d", $mon+1);
my $fullYear  = sprintf("%04d", $year+1900);
63 | |
# ====== main ======

# Command-line option state; defaults below may be overridden by switches.
my $helpfound     = 0;
my $nohosts       = 0;
my $overwritedata = 0;
my $hostname      = "";
my $useHTTPS      = 0;

# Default data file to read: the current month's AWStats history file.
my $fileToOpen = $awStatsDataDir . "/awstats" . $fullMonth . $fullYear . ($SITECONFIG ? ".$SITECONFIG" : "") . ".txt";
# Default URL alias file to write.
my $urlAliasFile = "urlalias" . ($SITECONFIG ? ".$SITECONFIG" : "") . ".txt";

# Parse command-line switches (leading dashes optional, case-insensitive).
foreach my $arg (@ARGV) {
    if ($arg =~ /^-*urllistfile=([^\s&]+)/i)  { $fileToOpen    = $1; next; }
    if ($arg =~ /^-*urlaliasfile=([^\s&]+)/i) { $urlAliasFile  = $1; next; }
    if ($arg =~ /^-*site=(.*)/i)              { $hostname      = $1; next; }
    if ($arg =~ /^-*h/i)                      { $helpfound     = 1;  next; }
    if ($arg =~ /^-*overwrite/i)              { $overwritedata = 1;  next; }
    if ($arg =~ /^-*secure/i)                 { $useHTTPS      = 1;  next; }
}

# If no host information was provided at all, we bomb out to usage.
$nohosts = 1 if ! $hostname && ! $SITECONFIG;

# If no hostname was set (i.e. no -site=) then we use the config value.
$hostname = $SITECONFIG if ! $hostname && $SITECONFIG;
92 | |
# Show usage help.
# Derive the directory, program name and extension from $0.
my $DIR; my $PROG; my $Extension;
($DIR=$0) =~ s/([^\/\\]*)$//; ($PROG=$1) =~ s/\.([^\.]*)$//; $Extension=$1;
if ($nohosts || $helpfound || ! @ARGV) {
    print "\n----- $PROG $VERSION -----\n";
    print ucfirst($PROG)." generates an 'urlalias' file from an input file.\n";
    print "The input file must contain a list of URLs (It can be an AWStats history file).\n";
    print "For each of those URLs, the script gets the corresponding HTML page and catches the\n";
    print "header information (title), then it writes an output file that contains one line\n";
    print "for each URL and several fields:\n";
    print "- The first field is the URL,\n";
    print "- The second is the title caught from the web page.\n";
    print "This resulting file can be used by AWStats urlalias plugin.\n";
    print "\n";
    print "Usage: $PROG.$Extension -site=www.myserver.com [options]\n";
    print "\n";
    print "The site parameter contains the web server to get the page from.\n";
    print "Where options are:\n";
    print "  -urllistfile=Input urllist file\n";
    print "   If this file is an AWStats history file then urlaliasbuilder will use the\n";
    print "   SIDER section of this file as its input URL's list.\n";
    print "  -urlaliasfile=Output urlalias file to build\n";
    print "  -overwrite    Overwrite output file if exists\n";
    print "  -secure       Use https protocol\n";
    print "\n";
    print "Example: $PROG.$Extension -site=www.someotherhost.com\n";
    print "\n";
    print "This is the default configuration used when no options are provided on command line:\n";
    print "Input urllist file:   $fileToOpen (overwritten by -urllistfile option)\n";
    print "Output urlalias file: $urlAliasFile (overwritten by -urlaliasfile option)\n";
    print "\n";
    print "This script was written from Simon Waight's original work title-grabber.pl.\n";
    print "\n";
    exit 0;
}
128 | |
my @archivedKeys = ();
my $counter   = 0;
my $pageTitle = "";

# Only read the existing alias file if we want to do a comparison and append
# new items only (i.e. not overwrite). A missing/unreadable file is treated
# as "no existing keys", preserving the original best-effort behaviour.
if ($overwritedata == 0) {
    if (open(my $alias_fh, '<', $urlAliasFile)) {
        while (my $line = <$alias_fh>) {
            chomp $line;
            $line =~ s/\r//;
            # Each line is "<url>\t<title>"; we only need the URL key.
            my @bits = split(/\t/, $line);
            # Scalar element assignment (the original used an array slice,
            # @archivedKeys[$counter]=@bits[0], which assigns a list).
            $archivedKeys[$counter] = $bits[0];
            $counter++;
            #print "key: " . $bits[0] . "\n";
        }
        close($alias_fh);
    }
}
148 | |
# Open input file (might be an AWStats history data file).
print "Reading input file: $fileToOpen\n";
open(FILE, '<', $fileToOpen) || die "Error: Can't open input urllist file $fileToOpen";
binmode FILE;

my @field = ();
my @addToAliasFile = ();
my $addToAliasFileCount = 0;
my $isawstatshistoryfile = 0;
while (<FILE>) {
    chomp $_; s/\r//;

    if ($_ =~ /^AWSTATS DATA FILE/) {
        print "This file looks like an AWStats history file. Searching URLs list...\n";
        $isawstatshistoryfile = 1;
    }

    # Split line out into fields.
    @field = split(/\s+/, $_);
    if (! $field[0]) { next; }

    # Either we are at the start of the SIDER (URL) section of a history
    # file, or the input is a plain URL list (every line is a URL).
    if (! $isawstatshistoryfile || $field[0] eq $FILEMARKER1) {

        my @urlfield = @field;
        if ($isawstatshistoryfile) {
            # Skip the BEGIN_SIDER marker line itself. (For a plain URL
            # list the current line is already the first URL; the original
            # code unconditionally read ahead and lost that first URL.)
            $_ = <FILE>;
            last unless defined $_;
            chomp $_; s/\r//;
            @urlfield = split(/\s+/, $_);
        }

        my $matched = 0;
        while (! defined $urlfield[0] || $urlfield[0] ne $FILEMARKER2) {
            if ($urlfield[0]) {
                # Compare the entry against the urlalias keys only if we
                # don't just want to overwrite the file with current items.
                if ($overwritedata == 0) {
                    foreach my $key (@archivedKeys) {
                        if ($urlfield[0] eq $key) {
                            $matched = 1;
                            last;
                        }
                    }
                    # It's a new URL, so add to the list of items to retrieve.
                    if ($matched == 0) {
                        $addToAliasFile[$addToAliasFileCount] = $urlfield[0];
                        $addToAliasFileCount++;
                        #print "new: " . $urlfield[0] . "\n"
                    }
                    $matched = 0;
                } else {
                    # No comparison, so everything is 'new'.
                    $addToAliasFile[$addToAliasFileCount] = $urlfield[0];
                    $addToAliasFileCount++;
                }
            }
            $_ = <FILE>;
            # EOF before END_SIDER: stop cleanly (the original code looped
            # forever here, since undef never equals the end marker).
            last unless defined $_;
            chomp $_; s/\r//;
            @urlfield = split(/\s+/, $_);
        }
    }
}

close(FILE);

if ($addToAliasFileCount == 0) {
    print "Found no new documents.\n\n";
    exit();
}

print "Found " . $addToAliasFileCount . " new documents with no alias.\n";
219 | |
# Buffer of "<url>\t<title>" lines to write out. (Declared exactly once;
# the original declared "my $fileOutput" twice at the same scope.)
my $fileOutput = "";

print "Looking those pages on web site '$hostname' to get alias...\n";

# Create a user agent (browser) object — direct method call, not the
# deprecated indirect "new LWP::UserAgent" syntax.
my $ua = LWP::UserAgent->new;
# Set user agent name.
$ua->agent($userAgent);
# Set user agent owner's e-mail address.
$ua->from($spiderOwner);
# Set timeout for requests.
$ua->timeout($getTimeOut);
if ($proxyServer) {
    # Set proxy for access to external sites.
    $ua->proxy(["http","https"], $proxyServer);
    # Avoid proxy for these hosts.
    $ua->no_proxy(@hostsNoProxy);
}
# Set maximum size of document to retrieve (in bytes).
$ua->max_size($maxDocSizeBytes);
if (!($ua->is_protocol_supported('https')) && $useHTTPS) {
    print "SSL is not supported on this machine.\n\n";
    exit();
}

# Now build the contents to write (or append) to the urlalias file.
# Each page is fetched exactly once, whatever the write mode: the original
# overwrite branch re-fetched every page a second time over HTTP.
foreach my $newAlias (@addToAliasFile) {
    sleep $throttleRequestsTime;
    my $newAliasEntry = Generate_Alias_List_Entry($newAlias);
    $fileOutput .= $newAliasEntry . "\n";
}

# Write the data back to the urlalias file: '>>' appends new entries,
# '>' overwrites the whole file. The die message reports $! (the OS error)
# rather than $_ as the original did.
my $writeMode = $overwritedata ? '>' : '>>';
open(my $out_fh, $writeMode, $urlAliasFile)
    || die "Error: Failed to open file for writing: $!\n\n";
print {$out_fh} $fileOutput;
close($out_fh);

print "File $urlAliasFile created/updated.\n";

exit();
273 | #--------------------------- End of Main ----------------------------- |
274 | |
275 | |
276 | # |
277 | # Generate new lines for urlalias file by doing a http get using data |
278 | # supplied. |
279 | # |
#
# Generate one line for the urlalias file ("<url>\t<title>") by doing an
# HTTP GET of the document at the supplied server-relative path. On any
# failure (request error, or an empty/missing <title> element) the title
# falls back to "Unknown Title".
#
sub Generate_Alias_List_Entry {

    # Take in the path & document (server-relative URL).
    my $urltoget = shift;

    # Protocol prefix depends on the -secure command line switch.
    my $urlPrefix = $useHTTPS ? "https://" : "http://";

    # First field is the URL itself, tab-separated from the title.
    my $AliasLine = $urltoget . "\t";

    # Build a full URL to pass to the user agent.
    my $fullurltoget = $urlPrefix . $hostname . $urltoget;

    # Create a HTTP request (direct method call; the original used the
    # deprecated indirect "new HTTP::Request GET =>" syntax).
    print "Getting page $fullurltoget\n";
    my $req = HTTP::Request->new(GET => $fullurltoget);

    # Pass request to the user agent and get a response back.
    my $res = $ua->request($req);

    # Parse returned document for the page title.
    $pageTitle = "";
    if ($res->is_success()) {
        $pageTitle = $res->title;
    } else {
        print "Failed to get page: " . $res->status_line . "\n";
        $pageTitle = "Unknown Title";
    }
    # $res->title returns undef when the page has no <title>; the original
    # compared undef with eq (warning) and concatenated undef below.
    if (! defined $pageTitle || $pageTitle eq "") {
        $pageTitle = "Unknown Title";
    }
    return $AliasLine . $pageTitle;
}