Ditch grep and do all filtering in pure Python
commit 3e364d5f51 (parent 575c7e89db)
@@ -1,3 +1,5 @@
 # pages_stats
 
-Simple script to gather daily global statistics for hugo post served
+Simple script to gather daily global statistics for Hugo posts served.
+
+The crawler-user-agents.json file comes from [this project](https://github.com/monperrus/crawler-user-agents/).
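The crawler_patterns list that the script matches against is built from this JSON file, but the loading code is not part of the lines changed below. As a rough sketch of what that step could look like, assuming the file keeps the upstream format (a JSON array of objects with a "pattern" field):

import json
import re

def load_crawler_patterns(path="crawler-user-agents.json"):
    # Hypothetical helper, not part of this commit: compile one regex per
    # crawler entry so the patterns can be fed to re.search later on.
    with open(path) as crawler_file:
        entries = json.load(crawler_file)
    return [re.compile(entry["pattern"]) for entry in entries]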
@@ -6,6 +6,7 @@ defined in the sitemap, by unique IP.
 import re
 import json
 import argparse
+from collections import defaultdict
 from itertools import repeat
 from subprocess import run
 from urllib.parse import urlparse
@@ -14,7 +15,7 @@ import xml.etree.ElementTree as ET
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Collect number of daily loading of each page in the nginx log file.')
-    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
+    parser.add_argument("-s", "--sitemap", default="/var/www/html/sitemap.xml",
                         help="Path to the sitemap xml file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                         help="Path to the log file to analyze")
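The main() code further down reads args.exclude_crawler, which is declared outside the lines shown in this diff. A self-contained sketch of how such a switch is typically declared with argparse (the flag spelling and help text here are guesses, not the project's actual definition):

import argparse

parser = argparse.ArgumentParser()
# Assumed flag: argparse maps "--exclude-crawler" to the dest "exclude_crawler".
parser.add_argument("-x", "--exclude-crawler", action="store_true",
                    help="Report crawler/bot hits separately from regular visits.")
args = parser.parse_args(["--exclude-crawler"])
print(args.exclude_crawler)  # True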
@@ -24,8 +25,10 @@ def parse_args():
 
 def main():
     """ Parses the arguments, the crawler file and the sitemap,
-    then for each locations, uses grep to select the lines containing GET calls for
-    the location, and prints the number of unique IP accessing it.
+    then reads the log file line by line, using a regex to extract the location and client IP.
+    It records the number of unique IPs accessing each known page (from the sitemap), and
+    the number of unique IPs accessing each unknown location
+    (either resources being loaded or bots looking for vulnerable websites).
     """
     args = parse_args()
 
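To illustrate the line-by-line matching the new docstring describes, here is roughly how the log_line_template introduced in the next hunk pulls the client IP and location out of a typical nginx access-log line (the log line and page path below are invented for the example):

import re

log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
known_page_regex = re.compile(log_line_template.format(locations=re.escape("/posts/hello-world/")))

sample = '198.51.100.42 - - [01/Jan/2024:00:00:00 +0000] "GET /posts/hello-world/ HTTP/1.1" 200 1234 "-" "Mozilla/5.0"'
match_obj = known_page_regex.match(sample)
if match_obj:
    # The named groups give the two values the script aggregates on.
    print(match_obj.group("ip_address"), match_obj.group("location"))
    # -> 198.51.100.42 /posts/hello-world/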
@@ -49,14 +52,65 @@ def main():
     for url in root:
         locations.append(urlparse(url.find(f"{ns}loc").text).path)
 
-    for path in locations:
-        # Pre-process log file using grep, to keep only interesting lines
-        cmd = ["grep", "-e", f'GET {path} ', args.logfile]
-        process = run(cmd, capture_output=True, text=True)
-        # Silmutaneously keep only unique source IP and exclude crawlers if resquested
-        lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
-
-        print(f"{path}: {len(lines)}")
+    log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
+    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations))))
+    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
+    visit_dict = dict(map(lambda x: (x, set()), locations))
+    bot_visit_dict = dict(map(lambda x: (x, set()), locations))
+    other_visits = defaultdict(set)
+    with open(args.logfile, 'r') as logfile:
+        for line in logfile:
+            match_obj = re.match(known_page_regex, line)
+            if match_obj:
+                client_ip = match_obj.group("ip_address")
+                location = match_obj.group("location")
+                if not any(map(re.search, crawler_patterns, repeat(line))):
+                    visit_dict[location].add(client_ip)
+                else:
+                    bot_visit_dict[location].add(client_ip)
+            else:
+                match_obj = re.match(other_pages_regex, line)
+                if match_obj:
+                    client_ip = match_obj.group("ip_address")
+                    location = match_obj.group("location")
+                    if location.startswith("/isso/"):
+                        other_visits["/isso/*"].add(client_ip)
+                    elif location.startswith("/assets/css/"):
+                        other_visits["/assets/css/*"].add(client_ip)
+                    elif location.startswith("/assets/js/"):
+                        other_visits["/assets/js/*"].add(client_ip)
+                    elif location.startswith("/images/"):
+                        other_visits["/images/*"].add(client_ip)
+                    else:
+                        other_visits[location.split('?')[0]].add(client_ip)
+
+
+    total_visits = 0
+    print("Standard visits:")
+    for loc, ips in visit_dict.items():
+        print(f"{loc}: {len(ips)}")
+        total_visits += len(ips)
+    print(f'Total visits: {total_visits}')
+    if args.exclude_crawler:
+        print("Bot visits:")
+        for loc, ips in bot_visit_dict.items():
+            print(f"{loc}: {len(ips)}")
+    nb_other_visits = 0
+    print("Other visits:")
+    for loc, ips in other_visits.items():
+        print(f"{loc}: {len(ips)}")
+        nb_other_visits += len(ips)
+    print(f'Total visits: {total_visits}')
+    print(f'Other visits: {nb_other_visits}')
+
+    #for path in locations:
+    #    # Pre-process log file using grep, to keep only interesting lines
+    #    cmd = ["grep", "-e", f'GET {path} ', args.logfile]
+    #    process = run(cmd, capture_output=True, text=True)
+    #    # Silmutaneously keep only unique source IP and exclude crawlers if resquested
+    #    lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
+    #
+    #    print(f"{path}: {len(lines)}")
 
 if __name__ == "__main__":
     main()
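For lines that do not match any sitemap location, the code above falls back to other_pages_regex and groups the hits under wildcard keys. A small self-contained sketch of that fallback path (again with an invented log line and asset path):

import re
from collections import defaultdict

log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
other_pages_regex = re.compile(log_line_template.format(locations='.+?'))

other_visits = defaultdict(set)
line = '192.0.2.10 - - [01/Jan/2024:00:00:01 +0000] "GET /assets/css/style.css HTTP/1.1" 200 512 "-" "Mozilla/5.0"'
match_obj = other_pages_regex.match(line)
if match_obj:
    location = match_obj.group("location")
    if location.startswith("/assets/css/"):
        # All stylesheet requests are counted under one wildcard bucket.
        other_visits["/assets/css/*"].add(match_obj.group("ip_address"))

print(dict(other_visits))  # {'/assets/css/*': {'192.0.2.10'}}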