Friday 18 March 2016

Python: Extracting 9-digit numbers from Tomcat access log files, counting each unique ID, and writing the results to a CSV file

I had to extract all the 9-digit IDs that were used in a set of Tomcat access logs. Here is how I solved it with Python:
import re
import os

from collections import Counter
# Match a run of exactly nine digits. The lookarounds reject nine-digit
# slices of longer numbers (timestamps, byte counts, ...) that a bare \d{9}
# would report as ids. The raw string keeps "\d" from being read as an
# (invalid) string escape sequence.
p = re.compile(r'(?<!\d)\d{9}(?!\d)')
output_file_name = 'output.csv'


def extract_ids(text):
    """Return every standalone 9-digit number in *text*, in order of appearance.

    Duplicates are kept so that the caller can count occurrences. *text* is
    searched as-is: line breaks must NOT be stripped beforehand, otherwise
    digits ending one log line fuse with digits starting the next one and
    produce ids that never appeared in the log.
    """
    return p.findall(text)


# Every student id found, one entry per occurrence across all log files
most_active = []
# For the files in this directory (sorted so that runs are reproducible;
# os.listdir() returns names in arbitrary order)
for name in sorted(os.listdir(".")):
    # Only files named localhost_access_log*.txt
    if name.startswith("localhost_access_log") and name.endswith(".txt"):
        # errors='replace' keeps a stray undecodable byte in a log from
        # aborting the whole run; the digits being searched for are unaffected
        with open(name, 'r', errors='replace') as myfile:
            print("Reading file: "+name)
            # extend() grows the list in place instead of copying it for
            # every file, as "most_active = most_active + ..." did
            most_active.extend(extract_ids(myfile.read()))

# Count how many times each student id occurs in the list.
# This tells us the most active students in the app.
count = Counter(most_active)

# Write one "id,occurrences" row per student, most common first.
# The with-block closes the file even if a write fails.
with open(output_file_name, "w") as output_file:
    output_file.write("Student ID, Activity\n")
    for student_id, activity in count.most_common():
        output_file.write(student_id+","+str(activity)+"\n")
print("File written successfully: "+output_file_name)
# Print out the total number of distinct students
print("Total Students:"+ str(len(count)))


No comments:

Post a Comment