First version of the script (#1)

Creation of the script

Co-authored-by: Hugo <saxodwarf@saxodwarf.fr>
Reviewed-on: #1
Co-authored-by: saxodwarf <saxodwarf@noreply.localhost>
Co-committed-by: saxodwarf <saxodwarf@noreply.localhost>
saxodwarf 2021-09-04 16:06:39 +02:00
parent b2058509c8
commit c3eb31258e
1 changed file with 62 additions and 0 deletions

get_page_stats.py Normal file

@@ -0,0 +1,62 @@
#!/usr/bin/env python3
""" Script to parse a sitemap.xml file,
then look through a NGINX log file for the number of hits for each of the URLs
defined in the sitemap, by unique IP.
"""
import re
import json
import argparse
from itertools import repeat
from subprocess import run
from urllib.parse import urlparse
import xml.etree.ElementTree as ET


def parse_args():
    parser = argparse.ArgumentParser(description='Collect the number of daily loads of each page from the nginx log file.')
    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
                        help="Path to the sitemap XML file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                        help="Path to the log file to analyze.")
    parser.add_argument("-e", "--exclude-crawler", action="store_true",
                        help="If set, uses a crawler-user-agents.json file to exclude requests made by bots.")
    return parser.parse_args()


def main():
    """ Parses the arguments, the crawler file and the sitemap,
    then for each location, uses grep to select the lines containing GET calls for
    the location, and prints the number of unique IPs accessing it.
    """
    args = parse_args()
    if args.exclude_crawler:
        try:
            with open("./crawler-user-agents.json", 'r') as crawler_file:
                crawlers = json.load(crawler_file)
        except (FileNotFoundError, json.JSONDecodeError):
            print("Could not open the crawler user agent file")
            crawlers = []
    else:
        crawlers = []
    # Crawler patterns are compiled once and for all for speed
    crawler_patterns = [re.compile(entry["pattern"]) for entry in crawlers]
    locations = []
    tree = ET.parse(args.sitemap)
    root = tree.getroot()
    # Get the default XML namespace, needed for tag lookup later
    ns = re.match(r'{.*}', root.tag).group(0)
    for url in root:
        # Keep only the path part of each <loc> URL, since the log lines contain paths
        locations.append(urlparse(url.find(f"{ns}loc").text).path)
    for path in locations:
        # Pre-process the log file using grep, to keep only the interesting lines
        cmd = ["grep", "-e", f'GET {path} ', args.logfile]
        process = run(cmd, capture_output=True, text=True)
        # Simultaneously keep only unique source IPs and exclude crawlers if requested
        lines = {line.split(' ')[0] for line in process.stdout.splitlines()
                 if not any(map(re.search, crawler_patterns, repeat(line)))}
        print(f"{path}: {len(lines)}")
if __name__ == "__main__":
main()
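
The densest step in main() is the final set comprehension: re.search is mapped over every compiled crawler pattern against the same log line, the line is dropped if any pattern matches, and otherwise only its first whitespace-separated field (the client IP) is kept, with the set handling deduplication. Below is a minimal, self-contained sketch of that filtering; the log lines, IP addresses and the Googlebot pattern are invented for illustration and are not part of this commit.

import re
from itertools import repeat

# One invented crawler pattern, standing in for the entries loaded from
# crawler-user-agents.json in the real script.
crawler_patterns = [re.compile("Googlebot")]

# Invented nginx log lines: two human hits from the same IP and one bot hit.
stdout = "\n".join([
    '203.0.113.7 - - [04/Sep/2021:10:00:00 +0200] "GET /about/ HTTP/1.1" 200 512 "-" "Mozilla/5.0"',
    '203.0.113.7 - - [04/Sep/2021:10:05:00 +0200] "GET /about/ HTTP/1.1" 200 512 "-" "Mozilla/5.0"',
    '198.51.100.2 - - [04/Sep/2021:10:06:00 +0200] "GET /about/ HTTP/1.1" 200 512 "-" "Googlebot/2.1"',
])

# Same expression as in the script: keep the first field (the source IP) of every
# line that matches no crawler pattern; the set removes duplicate IPs.
lines = {line.split(' ')[0] for line in stdout.splitlines()
         if not any(map(re.search, crawler_patterns, repeat(line)))}

print(lines)       # {'203.0.113.7'}
print(len(lines))  # 1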