#!/usr/bin/env python
# -*- coding: iso8859-2 -*-
#
# Parse apache access_log file for Google activity.
# Usage: google.py *access_log
#
# (C) Marek /Flash/ Wywiał <flash@irc.pl>
#

# using REgular expressions and SYStem functions
import re,sys

# test if access_log files given
if len(sys.argv) < 2:
	print "Usage: %s <apache_access_log_file(s)>" % (sys.argv[0])
	sys.exit()

# for all files in command line
for log_file in sys.argv[1:]:

	# open this file
	try:
		file = open(log_file)
	except:
		print "Error: could not open file %s" % (sys.argv[1])
		sys.exit()
	
	# print title and wait for [enter]
	print "==> %s --------------------------- [enter]" % (log_file),
	raw_input()

	# reading file line by line
	for line in file.readlines():

		# match for the 'combined' log format
		combined = re.match('^(\S+)\s+(\S+)\s+(\S+)\s+\[([^\]]+)\]\s+"([^"]+)"\s+(\d+)\s+(\S+)\s+"([^"]+)"\s+"([^"]+)"$', line)

		# if 'combined' log format and the user-agent is 'Google', show it
		if combined and combined.groups()[8].find('Google') > -1:
			log = combined.groups()
			print "date: %s\nfile: %s\nagent: %s\n" % (log[3], log[4], log[8])
	
	# close the file, of course
	file.close()

# CU
sys.exit()
