#!/usr/bin/env python3
"""Split existing nginx logs per year/month with verbose output.

This script accepts a configurable directory (defaulting to the current
working directory) of nginx logs. It looks for files named ``*.access_log``
and ``*.error_log``.

Access-log lines are expected to contain a timestamp inside square brackets,
e.g. ``[10/Oct/2000:13:55:36 -0700]``. Error-log lines are expected to start
with a ``YYYY/MM/DD HH:MM:SS`` timestamp, e.g. ``2000/10/10 13:55:36 [error]``.
The script extracts the date and writes each line into a separate file under
a subdirectory formatted as ``YYYY-MM``. Lines without a recognisable date go
to a subdirectory named ``unknown``.

Verbose output is provided via rich progress bars and spinners.

Usage:
    python split_nginx_logs.py --dir /path/to/logs
"""

import argparse
import re
from contextlib import ExitStack
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from typing import TextIO, Dict, Optional

from rich.console import Console
from rich.progress import (
    Progress,
    SpinnerColumn,
    BarColumn,
    TextColumn,
    TimeRemainingColumn,
    TaskID,
)

# Regular expression to extract date part from an access-log entry,
# e.g. '[10/Oct/2000:13:55:36 -0700]' -> '10/Oct/2000'.
DATE_REGEX = re.compile(r'\[([\d]{2}/[A-Za-z]{3}/[\d]{4}):')

# Regular expression to extract the date part from an error-log entry, which
# starts with 'YYYY/MM/DD HH:MM:SS ' instead of a bracketed timestamp.
ERROR_DATE_REGEX = re.compile(r'^(\d{4}/\d{2}/\d{2}) \d{2}:\d{2}:\d{2} ')

ACCESS_DATE_FORMAT = '%d/%b/%Y'
ERROR_DATE_FORMAT = '%Y/%m/%d'

# Bucket used for lines whose date is missing or cannot be parsed.
UNKNOWN_KEY = 'unknown'

# Glob patterns of the log files that get split.
LOG_FILE_PATTERNS = ('*.access_log', '*.error_log')

console = Console()


@lru_cache(maxsize=4096)
def _month_key(date_str: str, date_format: str) -> str:
    """Return the ``YYYY-MM`` bucket for a date string, or ``unknown``.

    Results are cached because a log file contains only a handful of distinct
    dates while ``strptime`` is comparatively expensive to run for each line.
    """
    try:
        date_obj = datetime.strptime(date_str, date_format)
    except ValueError:
        return UNKNOWN_KEY
    return f'{date_obj.year}-{date_obj.month:02d}'


def _bucket_for_line(line: str) -> str:
    """Return the output subdirectory name for a single log line."""
    # The bracketed access-log timestamp is checked first so that every line
    # that was classified before keeps exactly the same bucket.
    bracketed = DATE_REGEX.search(line)
    if bracketed:
        return _month_key(bracketed.group(1), ACCESS_DATE_FORMAT)
    leading = ERROR_DATE_REGEX.match(line)
    if leading:
        return _month_key(leading.group(1), ERROR_DATE_FORMAT)
    return UNKNOWN_KEY


def process_log_file(
    file_path: Path,
    base_dir: Path,
    progress: Optional[Progress] = None,
    task_id: Optional[TaskID] = None,
) -> None:
    """Split a given nginx log file per year/month.

    Reads the log file line by line, extracts the timestamp, and writes each
    line to its corresponding output file. The output file is created under a
    subdirectory in the base directory named as ``YYYY-MM`` (or ``unknown``
    when no date can be parsed) and retains the original file name. An
    existing output file of the same name is overwritten.

    Lines are copied byte-for-byte: undecodable bytes are round-tripped via
    ``surrogateescape`` and line endings are not translated.

    Args:
        file_path: Path to the original log file.
        base_dir: Base directory for output files.
        progress: Optional rich Progress instance for updating progress.
        task_id: Optional rich TaskID for progress update.

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
    """
    # TaskID 0 is a valid task, so both values are compared against None.
    report_progress = progress is not None and task_id is not None
    out_files: Dict[str, TextIO] = {}

    # ExitStack closes every handle opened so far, even if the loop raises or
    # one of the close() calls itself fails.
    with ExitStack() as stack:
        fin = stack.enter_context(
            file_path.open(
                'r',
                encoding='utf-8',
                errors='surrogateescape',
                newline='\n',
            )
        )
        for line in fin:
            key = _bucket_for_line(line)
            out_file = out_files.get(key)
            if out_file is None:
                out_path = base_dir / key / file_path.name
                out_path.parent.mkdir(parents=True, exist_ok=True)
                out_file = stack.enter_context(
                    out_path.open(
                        'w',
                        encoding='utf-8',
                        errors='surrogateescape',
                        newline='',
                    )
                )
                out_files[key] = out_file
            out_file.write(line)
            if report_progress:
                # Advance progress by the number of bytes processed, matching
                # the file size used as the task total.
                progress.advance(
                    task_id,
                    len(line.encode('utf-8', errors='surrogateescape')),
                )


def split_nginx_logs(directory: Path) -> None:
    """Split all nginx log files in the given directory per year/month.

    It looks for files named ``*.access_log`` and ``*.error_log`` directly in
    the specified directory (not in subdirectories) and processes them in
    sorted order with verbose output.

    Args:
        directory: The directory where the log files are located.
    """
    log_files = sorted(
        path
        for pattern in LOG_FILE_PATTERNS
        for path in directory.glob(pattern)
        if path.is_file()
    )

    if not log_files:
        console.print('[bold red]No log files found.[/bold red]')
        return

    console.print(f'[bold green]Found {len(log_files)} log files.[/bold green]')

    # Create a progress bar for file processing.
    with Progress(
        SpinnerColumn(),
        '[progress.description]{task.description}',
        BarColumn(),
        '[progress.percentage]{task.percentage:>3.0f}%',
        TimeRemainingColumn(),
        console=console,
        transient=True,
    ) as progress:
        for log_file in log_files:
            try:
                file_size = log_file.stat().st_size
            except OSError:
                # The size only drives the progress bar; keep going without it.
                file_size = 0
            task_description = f'Processing [cyan]{log_file.name}[/cyan]'
            task_id = progress.add_task(task_description, total=file_size)
            process_log_file(log_file, directory, progress, task_id)
            progress.remove_task(task_id)
            console.print(f'[green]Finished processing:[/green] {log_file}')

    console.print('[bold blue]All files processed successfully.[/bold blue]')


def parse_arguments() -> argparse.Namespace:
    """Parse command-line arguments.

    Returns:
        The parsed arguments with the directory path in ``dir``.
    """
    parser = argparse.ArgumentParser(
        description='Split nginx logs per year/month with verbose logging.'
    )
    parser.add_argument(
        '--dir',
        type=str,
        default='.',
        help='Directory containing the nginx log files (default: current '
        'working directory)',
    )
    return parser.parse_args()


def main() -> None:
    """Main entry point for splitting nginx logs."""
    args = parse_arguments()
    directory = Path(args.dir).resolve()

    # is_dir() is already False for a path that does not exist.
    if not directory.is_dir():
        console.print(
            f'[bold red]Error:[/bold red] {directory} is not a valid directory.'
        )
        return

    console.print('[bold green]Starting log splitting...[/bold green]')
    split_nginx_logs(directory)


if __name__ == '__main__':
    main()