ditch grep and do all filtering in pure Python

Hugo 2021-09-11 17:33:06 +02:00
parent 575c7e89db
commit 3e364d5f51
2 changed files with 68 additions and 12 deletions

View File

@@ -1,3 +1,5 @@
# pages_stats
Simple script to gather daily global statistics for the Hugo posts served
Simple script to gather daily global statistics for the Hugo posts served.
The crawler-user-agents.json file comes from [this project](https://github.com/monperrus/crawler-user-agents/).
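The `crawler_patterns` list used by the script is built from this file. The loading code sits outside the hunks shown below, so the following is only a sketch: it assumes the upstream JSON layout (a list of objects with a `pattern` key), and the helper name and default path are illustrative.

import json

def load_crawler_patterns(path="crawler-user-agents.json"):
    # Each entry in the upstream file is an object whose "pattern" key holds a
    # regular expression matching a bot's user-agent string.
    with open(path, "r") as f:
        return [entry["pattern"] for entry in json.load(f)]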

View File

@@ -6,6 +6,7 @@ defined in the sitemap, by unique IP.
import re
import json
import argparse
from collections import defaultdict
from itertools import repeat
from subprocess import run
from urllib.parse import urlparse
@@ -14,7 +15,7 @@ import xml.etree.ElementTree as ET
def parse_args():
    parser = argparse.ArgumentParser(description='Collect number of daily loading of each page in the nginx log file.')
    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
    parser.add_argument("-s", "--sitemap", default="/var/www/html/sitemap.xml",
                        help="Path to the sitemap xml file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                        help="Path to the log file to analyze")
@@ -24,8 +25,10 @@ def parse_args():
def main():
""" Parses the arguments, the crawler file and the sitemap,
then for each locations, uses grep to select the lines containing GET calls for
the location, and prints the number of unique IP accessing it.
Then reads the log file line by line, regexes through it to isolate locations and client IP
It records the number of unique IP accessing each known pages (from the sitemap), and
the number of unique IP accessing each unknown locations.
(either ressources being loaded or bot looking for vulnerable website).
"""
args = parse_args()
@@ -49,14 +52,65 @@ def main():
    for url in root:
        locations.append(urlparse(url.find(f"{ns}loc").text).path)
    for path in locations:
        # Pre-process log file using grep, to keep only interesting lines
        cmd = ["grep", "-e", f'GET {path} ', args.logfile]
        process = run(cmd, capture_output=True, text=True)
        # Simultaneously keep only unique source IPs and exclude crawlers if requested
        lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
        print(f"{path}: {len(lines)}")
    log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations))))
    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
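    # Illustrative example (made-up line, not from the real log): for an access-log
    # entry such as
    #   203.0.113.7 - - [11/Sep/2021:17:33:06 +0200] "GET /posts/hello/ HTTP/1.1" 200 ...
    # known_page_regex captures ip_address='203.0.113.7' and location='/posts/hello/'
    # when that path is listed in the sitemap; other_pages_regex is the fallback that
    # captures whatever path follows "GET " for requests outside the sitemap.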
    visit_dict = dict(map(lambda x: (x, set()), locations))
    bot_visit_dict = dict(map(lambda x: (x, set()), locations))
    other_visits = defaultdict(set)
    with open(args.logfile, 'r') as logfile:
        for line in logfile:
            match_obj = re.match(known_page_regex, line)
            if match_obj:
                client_ip = match_obj.group("ip_address")
                location = match_obj.group("location")
                if not any(map(re.search, crawler_patterns, repeat(line))):
                    visit_dict[location].add(client_ip)
                else:
                    bot_visit_dict[location].add(client_ip)
            else:
                match_obj = re.match(other_pages_regex, line)
                if match_obj:
                    client_ip = match_obj.group("ip_address")
                    location = match_obj.group("location")
                    if location.startswith("/isso/"):
                        other_visits["/isso/*"].add(client_ip)
                    elif location.startswith("/assets/css/"):
                        other_visits["/assets/css/*"].add(client_ip)
                    elif location.startswith("/assets/js/"):
                        other_visits["/assets/js/*"].add(client_ip)
                    elif location.startswith("/images/"):
                        other_visits["/images/*"].add(client_ip)
                    else:
                        other_visits[location.split('?')[0]].add(client_ip)
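    # The elif chain above collapses noisy per-file requests into wildcard buckets
    # (/isso/* presumably being the Isso comment server, plus static assets and
    # images) and strips query strings from everything else. An equivalent,
    # untested sketch (not in the original script) using a prefix table instead:
    #   GROUPED = ("/isso/", "/assets/css/", "/assets/js/", "/images/")
    #   prefix = next((p for p in GROUPED if location.startswith(p)), None)
    #   other_visits[prefix + "*" if prefix else location.split('?')[0]].add(client_ip)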
    total_visits = 0
    print("Standard visits:")
    for loc, ips in visit_dict.items():
        print(f"{loc}: {len(ips)}")
        total_visits += len(ips)
    print(f'Total visits: {total_visits}')
    if args.exclude_crawler:
        print("Bot visits:")
        for loc, ips in bot_visit_dict.items():
            print(f"{loc}: {len(ips)}")
    nb_other_visits = 0
    print("Other visits:")
    for loc, ips in other_visits.items():
        print(f"{loc}: {len(ips)}")
        nb_other_visits += len(ips)
    print(f'Total visits: {total_visits}')
    print(f'Other visits: {nb_other_visits}')
    #for path in locations:
    #    # Pre-process log file using grep, to keep only interesting lines
    #    cmd = ["grep", "-e", f'GET {path} ', args.logfile]
    #    process = run(cmd, capture_output=True, text=True)
    #    # Simultaneously keep only unique source IPs and exclude crawlers if requested
    #    lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
    #
    #    print(f"{path}: {len(lines)}")
if __name__ == "__main__":
    main()
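For reference, a minimal self-contained check of the new matching logic. The sample sitemap paths and log line are made up, since the real script builds `locations` from the sitemap and reads the nginx access log:

import re

log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
# Hypothetical sitemap paths, for illustration only.
locations = ["/", "/posts/hello/"]
known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations))))

# A made-up nginx access-log line in the combined format.
sample = '203.0.113.7 - - [11/Sep/2021:17:33:06 +0200] "GET /posts/hello/ HTTP/1.1" 200 1234 "-" "Mozilla/5.0"'
match = known_page_regex.match(sample)
print(match.group("ip_address"), match.group("location"))  # 203.0.113.7 /posts/hello/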