import re
import os
from collections import Counter
# Count how often each 9-digit student id appears in the Tomcat access logs
# found in a directory and write the ranking to a CSV file.

# Matches any run of nine consecutive digits. Written as a raw string so the
# backslash reaches the regex engine untouched instead of relying on Python
# passing an unknown string escape through.
# NOTE(review): this also matches the first nine digits of a longer digit run
# (e.g. a 13-digit timestamp) - confirm whether such numbers occur in the logs.
p = re.compile(r'\d{9}')
output_file_name = 'output.csv'


def count_student_ids(directory="."):
    """Count the 9-digit ids found in the access logs inside *directory*.

    Only entries whose name starts with ``localhost_access_log`` and ends
    with ``.txt`` are read.

    Args:
        directory: Folder to scan; defaults to the current directory.

    Returns:
        A ``Counter`` mapping each id (as a string) to its number of
        occurrences across all matching files.
    """
    counts = Counter()
    # Sorted so that the processing order (and therefore the order of ids with
    # equal counts in the report) does not depend on the file system.
    for name in sorted(os.listdir(directory)):
        if not (name.startswith("localhost_access_log") and name.endswith(".txt")):
            continue
        with open(os.path.join(directory, name), 'r') as log_file:
            print("Reading file: "+name)
            # Scan line by line: joining the lines first would glue the last
            # digits of one line to the first digits of the next and invent
            # ids that never appeared in the log. It also avoids holding a
            # whole log file in memory.
            for log_line in log_file:
                counts.update(p.findall(log_line))
    return counts


def write_report(counts, path=output_file_name):
    """Write *counts* to *path* as CSV, most frequent id first.

    Args:
        counts: ``Counter`` of id -> occurrences.
        path: Destination file; it is overwritten if it already exists.
    """
    # The context manager closes the file even if a write fails.
    with open(path, "w") as output_file:
        output_file.write("Student ID, Activity\n")
        output_file.writelines(
            "{},{}\n".format(student_id, hits)
            for student_id, hits in counts.most_common()
        )


def main():
    """Scan the current directory, write the CSV report and print a summary."""
    count = count_student_ids(".")
    write_report(count, output_file_name)
    print("File written successfully: "+output_file_name)
    # Print out the total amount of distinct students
    print("Total Students:"+ str(len(count)))


if __name__ == "__main__":
    main()
Friday, 18 March 2016
Python: Extracting 9-digit numbers from Tomcat access log files into a unique set and printing to the screen
I had to extract all the 9-digit IDs that were used in a Tomcat access log. Here is how I solved it with Python.
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment