diff --git a/README.md b/README.md
index 57b030a..1e9e482 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
 # pages_stats
 
-Simple script to gather daily global statistics for hugo post served
\ No newline at end of file
+Simple script to gather daily global statistics for the Hugo posts served.
+
+The crawler-user-agents.json file comes from [this project](https://github.com/monperrus/crawler-user-agents/).
diff --git a/get_page_stats.py b/get_page_stats.py
index 6dcba4b..ae2ecf9 100644
--- a/get_page_stats.py
+++ b/get_page_stats.py
@@ -6,6 +6,7 @@ defined in the sitemap, by unique IP.
 import re
 import json
 import argparse
+from collections import defaultdict
 from itertools import repeat
 from subprocess import run
 from urllib.parse import urlparse
@@ -14,7 +15,7 @@ import xml.etree.ElementTree as ET
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Collect number of daily loading of each page in the nginx log file.')
-    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
+    parser.add_argument("-s", "--sitemap", default="/var/www/html/sitemap.xml",
                         help="Path to the sitemap xml file for the website.")
     parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                         help="Path to the log file to analyze")
@@ -24,8 +25,10 @@ def parse_args():
 def main():
     """
     Parses the arguments, the crawler file and the sitemap,
-    then for each locations, uses grep to select the lines containing GET calls for
-    the location, and prints the number of unique IP accessing it.
+    then reads the log file line by line, using regexes to isolate the location and the client IP.
+    It records the number of unique IPs accessing each known page (from the sitemap), and
+    the number of unique IPs accessing each unknown location
+    (either resources being loaded or bots probing for vulnerable websites).
""" args = parse_args() @@ -49,14 +52,65 @@ def main(): for url in root: locations.append(urlparse(url.find(f"{ns}loc").text).path) - for path in locations: - # Pre-process log file using grep, to keep only interesting lines - cmd = ["grep", "-e", f'GET {path} ', args.logfile] - process = run(cmd, capture_output=True, text=True) - # Silmutaneously keep only unique source IP and exclude crawlers if resquested - lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))} - - print(f"{path}: {len(lines)}") + log_line_template = r'^(?P[0-9a-f.:]+) .*"GET (?P{locations}) .*' + known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations)))) + other_pages_regex = re.compile(log_line_template.format(locations='.+?')) + visit_dict = dict(map(lambda x: (x, set()), locations)) + bot_visit_dict = dict(map(lambda x: (x, set()), locations)) + other_visits = defaultdict(set) + with open(args.logfile, 'r') as logfile: + for line in logfile: + match_obj = re.match(known_page_regex, line) + if match_obj: + client_ip = match_obj.group("ip_address") + location = match_obj.group("location") + if not any(map(re.search, crawler_patterns, repeat(line))): + visit_dict[location].add(client_ip) + else: + bot_visit_dict[location].add(client_ip) + else: + match_obj = re.match(other_pages_regex, line) + if match_obj: + client_ip = match_obj.group("ip_address") + location = match_obj.group("location") + if location.startswith("/isso/"): + other_visits["/isso/*"].add(client_ip) + elif location.startswith("/assets/css/"): + other_visits["/assets/css/*"].add(client_ip) + elif location.startswith("/assets/js/"): + other_visits["/assets/js/*"].add(client_ip) + elif location.startswith("/images/"): + other_visits["/images/*"].add(client_ip) + else: + other_visits[location.split('?')[0]].add(client_ip) + + + total_visits=0 + print("Standard visits:") + for loc, ips in visit_dict.items(): + print(f"{loc}: {len(ips)}") + total_visits += len(ips) + print(f'Total visits: {total_visits}') + if args.exclude_crawler: + print("Bot visits:") + for loc, ips in bot_visit_dict.items(): + print(f"{loc}: {len(ips)}") + nb_other_visits = 0 + print("Other visits:") + for loc, ips in other_visits.items(): + print(f"{loc}: {len(ips)}") + nb_other_visits += len(ips) + print(f'Total visits: {total_visits}') + print(f'Other visits: {nb_other_visits}') + + #for path in locations: + # # Pre-process log file using grep, to keep only interesting lines + # cmd = ["grep", "-e", f'GET {path} ', args.logfile] + # process = run(cmd, capture_output=True, text=True) + # # Silmutaneously keep only unique source IP and exclude crawlers if resquested + # lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))} + # + # print(f"{path}: {len(lines)}") if __name__ == "__main__": main()