Logfiles

Field Description
remotehost Remote hostname (or IP address if the DNS hostname
is not available or if DNSLookup is off).
rfc931 The remote logname of the user, if
present.
authuser The username of the remote user after
authentication by the HTTP server.
[date] Date and time of the request.
“request” The request, exactly as it came from the
browser or client.
status The HTTP status code the server sent back
to the client.
bytes The number of bytes (Content-Length)
transferred to the client.
# Load every line of the log file into memory. The context manager closes
# the file handle as soon as the lines have been read.
with open('./files/nginx_logfile.log') as fh:
    sample_logs = fh.readlines()

Hosts

import re
# Host field: the leading run of non-whitespace characters (containing a dot)
# at the start of the line, up to the first whitespace character.
host_pattern = r'(^\S+\.[\S+\.]+\S+)\s'

# Compile once and search each line once; lines without a host are
# recorded as the placeholder string 'no match'.
host_hits = map(re.compile(host_pattern).search, sample_logs)
hosts = [hit.group(1) if hit else 'no match' for hit in host_hits]
hosts[:5]
['188.138.41.208',
 '188.138.41.208',
 '188.138.41.208',
 '66.249.87.8',
 '188.138.41.208']

timestamps

# Timestamp field, e.g. "[10/Oct/2000:13:55:36 +0200]"; the surrounding
# brackets are not part of the captured group. The UTC offset may carry
# either sign, so both "+hhmm" and "-hhmm" are accepted.
ts_pattern = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]'

import re
# Compile once and search each line once; lines without a timestamp are
# recorded as the placeholder string 'no match'.
ts_hits = map(re.compile(ts_pattern).search, sample_logs)
timestamps = [hit.group(1) if hit else 'no match' for hit in ts_hits]
timestamps[:5]
['no match', 'no match', 'no match', 'no match', 'no match']

Method, URI and protocol

# Request field: the double-quoted part of the line, split into up to three
# whitespace-separated tokens (method, URI, protocol). The protocol group
# may be empty.
method_uri_protocol_pattern = r'\"(\S+)\s(\S+)\s*(\S*)\"'
request_regex = re.compile(method_uri_protocol_pattern)

# One entry per log line: a 3-tuple of captured groups, or the placeholder
# string 'no match' when the line has no matching quoted request.
method_uri_protocol = []
for log_line in sample_logs:
    hit = request_regex.search(log_line)
    method_uri_protocol.append(hit.groups() if hit else 'no match')
method_uri_protocol[:5]
[('GET', '/static/css/navbar.css', 'HTTP/1.1'),
 ('GET', '/static/css/vocaword.css', 'HTTP/1.1'),
 ('GET', '/static/css/style.css', 'HTTP/1.1'),
 ('GET', '/archiv/2013_11_23', 'HTTP/1.1'),
 ('GET', '/archiv/2013_11_07', 'HTTP/1.1')]

status-Code

# Status field: the first three-digit number delimited by whitespace on
# both sides.
status_pattern = r'\s(\d{3})\s'

# Search each line once. A line without a status code yields the
# placeholder 'no match' (as in the other fields) instead of raising
# AttributeError on a None match.
status_hits = map(re.compile(status_pattern).search, sample_logs)
status = [hit.group(1) if hit else 'no match' for hit in status_hits]

# Print the error responses in the inspected slice. 400 itself is an error
# status, so the comparison is inclusive; placeholders are skipped.
for code in status[100:300]:
    if code != 'no match' and int(code) >= 400:
        print(code, end=',')
404,404,404,404,

content size

# Content size: the number that follows the three-digit status code, which
# in turn follows the closing quote of the request. Anchoring on that
# sequence (rather than on end-of-line) also matches log lines where
# further fields such as referer and user agent follow the byte count.
content_size_pattern = r'"\s\d{3}\s(\d+)(?:\s|$)'

# Search each line once. Each entry is the tuple of captured groups (a
# 1-tuple holding the size as a string), or the placeholder string
# 'no match' when the line has no status/size pair.
size_hits = map(re.compile(content_size_pattern).search, sample_logs)
content_size = [hit.groups() if hit else 'no match' for hit in size_hits]
content_size[:5]
['no match', 'no match', 'no match', 'no match', 'no match']