Ultimate File Framework

Process, convert, validate, and manage files across formats using Python. This skill covers file I/O operations, format detection, metadata extraction, batch processing pipelines, archive handling, and secure file validation for building robust file management systems.

When to Use This Skill

Choose Ultimate File Framework when you need to:

  • Read, write, and convert between file formats (CSV, JSON, XML, YAML, TOML, Parquet)
  • Build batch file processing pipelines with validation and error handling
  • Extract metadata and detect file types regardless of extension
  • Handle archives (ZIP, TAR, GZIP) and large file streaming

Consider alternatives when:

  • You need image-specific processing (use Pillow or OpenCV)
  • You need PDF parsing and generation (use pdfplumber or ReportLab)
  • You need cloud storage file management (use boto3, azure-storage, or gcloud)

Quick Start

```shell
pip install python-magic pyyaml toml pandas
```
```python
import csv
import hashlib
import json
from pathlib import Path


class FileProcessor:
    """Universal file processor with format detection and conversion."""

    READERS = {
        '.json': '_read_json',
        '.csv': '_read_csv',
        '.yaml': '_read_yaml',
        '.yml': '_read_yaml',
        '.toml': '_read_toml',
        '.txt': '_read_text',
    }

    def __init__(self, base_dir: str = '.'):
        self.base_dir = Path(base_dir)

    def read(self, filepath: str) -> dict:
        """Read file with automatic format detection."""
        path = Path(filepath)
        ext = path.suffix.lower()
        reader = self.READERS.get(ext)
        if not reader:
            raise ValueError(f"Unsupported format: {ext}")
        return getattr(self, reader)(path)

    def file_info(self, filepath: str) -> dict:
        """Get comprehensive file metadata."""
        path = Path(filepath)
        stat = path.stat()
        with open(path, 'rb') as f:
            content = f.read()
        return {
            'name': path.name,
            'extension': path.suffix,
            'size_bytes': stat.st_size,
            'size_human': self._human_size(stat.st_size),
            'modified': stat.st_mtime,
            'md5': hashlib.md5(content).hexdigest(),
            'sha256': hashlib.sha256(content).hexdigest(),
            'lines': content.count(b'\n') if self._is_text(path) else None,
        }

    def _read_json(self, path):
        with open(path) as f:
            return json.load(f)

    def _read_csv(self, path):
        # newline='' lets the csv module handle line endings itself
        with open(path, newline='') as f:
            return list(csv.DictReader(f))

    def _read_yaml(self, path):
        import yaml
        with open(path) as f:
            return yaml.safe_load(f)

    def _read_toml(self, path):
        import toml
        with open(path) as f:
            return toml.load(f)

    def _read_text(self, path):
        with open(path) as f:
            return {'content': f.read()}

    def _is_text(self, path):
        try:
            with open(path, 'r') as f:
                f.read(1024)
            return True
        except UnicodeDecodeError:
            return False

    @staticmethod
    def _human_size(size):
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size < 1024:
                return f"{size:.1f} {unit}"
            size /= 1024
        return f"{size:.1f} PB"  # fallback so very large sizes don't return None


fp = FileProcessor()
# info = fp.file_info("data.json")
# data = fp.read("config.yaml")
```

Core Concepts

Format Comparison

| Format | Best For | Human Readable | Schema Support | Streaming |
|---|---|---|---|---|
| JSON | APIs, config, nested data | Yes | JSON Schema | Limited |
| CSV | Tabular data, spreadsheets | Yes | No | Yes |
| YAML | Config files, DevOps | Yes | No | Yes |
| TOML | Simple config (pyproject.toml) | Yes | No | No |
| Parquet | Large analytics datasets | No | Built-in | Yes |
| XML | Document markup, legacy systems | Yes | XSD | Yes |
| MessagePack | Fast binary serialization | No | No | Yes |
| Protocol Buffers | RPC, typed binary data | No | Built-in | Yes |
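To make the streaming column concrete, here is a small stdlib sketch: the same records serialized as JSON (which must be parsed whole) and as CSV (which can be consumed row by row). The record values are deliberately kept as strings, since CSV carries no type information.

```python
import csv
import io
import json

records = [
    {"name": "alice", "score": "91"},
    {"name": "bob", "score": "78"},
]

# JSON: the whole document is parsed before any record is available.
json_blob = json.dumps(records)
from_json = json.loads(json_blob)

# CSV: rows can be consumed one at a time without loading the full file.
csv_buf = io.StringIO()
writer = csv.DictWriter(csv_buf, fieldnames=["name", "score"])
writer.writeheader()
writer.writerows(records)
csv_buf.seek(0)
from_csv = [dict(row) for row in csv.DictReader(csv_buf)]  # lazy iteration

print(from_json == from_csv)  # both round-trips preserve the records
```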

Batch File Processing Pipeline

```python
import json
import shutil
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from pathlib import Path
from typing import Callable


class BatchProcessor:
    """Process files in bulk with validation and error handling."""

    def __init__(self, input_dir: str, output_dir: str, error_dir: str = None):
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.error_dir = Path(error_dir or f"{output_dir}_errors")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.error_dir.mkdir(parents=True, exist_ok=True)
        self.stats = {'processed': 0, 'errors': 0, 'skipped': 0}

    def process(self, pattern: str, handler: Callable,
                max_workers: int = 4, validate: Callable = None):
        """Process files matching pattern with the handler function."""
        files = list(self.input_dir.glob(pattern))
        print(f"Found {len(files)} files matching '{pattern}'")

        def process_file(filepath):
            try:
                if validate and not validate(filepath):
                    self.stats['skipped'] += 1
                    return
                result = handler(filepath)
                output_path = self.output_dir / filepath.name
                if isinstance(result, str):
                    output_path.write_text(result)
                elif isinstance(result, bytes):
                    output_path.write_bytes(result)
                elif isinstance(result, dict):
                    output_path.with_suffix('.json').write_text(
                        json.dumps(result, indent=2))
                self.stats['processed'] += 1
            except Exception as e:
                # Quarantine the failing file alongside an error log
                self.stats['errors'] += 1
                shutil.copy2(filepath, self.error_dir / filepath.name)
                error_log = self.error_dir / f"{filepath.stem}_error.txt"
                error_log.write_text(f"{datetime.now()}: {e}")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(process_file, files)

        print(f"Results: {self.stats}")
        return self.stats


# Usage
# processor = BatchProcessor("./raw_data", "./processed")
# processor.process("*.csv", lambda f: convert_csv_to_json(f))
```

Configuration

| Parameter | Description | Default |
|---|---|---|
| encoding | Default text encoding | "utf-8" |
| max_file_size | Maximum allowed file size | 100 MB |
| chunk_size | Streaming read chunk size | 8192 bytes |
| hash_algorithm | Checksum algorithm | "sha256" |
| detect_mime | Use python-magic for MIME detection | true |
| max_workers | Parallel processing threads | 4 |
| validate_extension | Verify content matches extension | true |
| backup_originals | Keep original files before conversion | true |
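These parameters are not tied to a published API; as one sketch, they could be grouped into a dataclass. The class name and defaults below simply mirror the table, everything else is an assumption.

```python
from dataclasses import dataclass


@dataclass
class ProcessorConfig:
    """Hypothetical configuration object mirroring the table above."""
    encoding: str = "utf-8"
    max_file_size: int = 100 * 1024 * 1024   # 100 MB, in bytes
    chunk_size: int = 8192                    # streaming read block size
    hash_algorithm: str = "sha256"
    detect_mime: bool = True
    max_workers: int = 4
    validate_extension: bool = True
    backup_originals: bool = True


cfg = ProcessorConfig(max_workers=8)  # override only what differs
```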

Best Practices

  1. Validate file content, not just extensions — File extensions can be spoofed. Use python-magic to detect the actual MIME type from file headers: magic.from_file(path, mime=True). This prevents processing a renamed executable as an innocent CSV file.
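python-magic wraps libmagic's full signature database; as a dependency-free sketch of the same idea, a few well-known magic bytes can be checked by hand. The signature table here is deliberately tiny and illustrative.

```python
# Minimal magic-byte sniffing; python-magic covers far more signatures.
MAGIC_SIGNATURES = {
    b"\x89PNG\r\n\x1a\n": "image/png",
    b"PK\x03\x04": "application/zip",
    b"%PDF-": "application/pdf",
    b"\x1f\x8b": "application/gzip",
}


def sniff_mime(header: bytes) -> str:
    """Return a MIME guess from the first bytes of a file, or 'unknown'."""
    for signature, mime in MAGIC_SIGNATURES.items():
        if header.startswith(signature):
            return mime
    return "unknown"


# A ZIP renamed to .csv would be caught by its header bytes:
print(sniff_mime(b"PK\x03\x04rest-of-archive"))  # application/zip
```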

  2. Stream large files instead of loading them into memory — Use generators and chunked reading for files over 100 MB. The idiom for chunk in iter(lambda: f.read(8192), b'') reads in 8 KB blocks. Pandas offers a chunksize parameter for CSV processing: pd.read_csv(path, chunksize=10000).
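The chunked-read idiom above can be sketched as a streaming SHA-256 that never holds more than one block in memory:

```python
import hashlib
import tempfile


def hash_file(path: str, chunk_size: int = 8192) -> str:
    """Hash a file in fixed-size chunks instead of reading it whole."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


# Demo on a throwaway file larger than one chunk
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(b"x" * 100_000)
    tmp_path = tmp.name

streamed = hash_file(tmp_path)
whole = hashlib.sha256(b"x" * 100_000).hexdigest()
print(streamed == whole)  # True: chunked and whole-file hashes match
```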

  3. Always specify encoding explicitly — Default encoding varies by OS (UTF-8 on Linux/Mac, cp1252 on Windows). Always pass encoding='utf-8' to open(). For unknown encodings, detect with chardet.detect() before reading.
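A minimal demonstration of why the explicit encoding matters: reading the same bytes back with the wrong codec raises no error here, it just silently produces mojibake.

```python
import tempfile

text = "café naïve résumé"

with tempfile.NamedTemporaryFile("w", encoding="utf-8",
                                 delete=False, suffix=".txt") as f:
    f.write(text)
    path = f.name

# Explicit encoding: identical result on every OS.
with open(path, encoding="utf-8") as f:
    roundtrip = f.read()

# Wrong codec: no exception, just corrupted accented characters.
with open(path, encoding="latin-1") as f:
    mojibake = f.read()

print(roundtrip == text)   # True
print(mojibake == text)    # False
```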

  4. Use pathlib.Path over os.path for modern file operations — Path provides cleaner syntax: path.suffix, path.stem, path.parent, path.glob('**/*.csv'). It handles OS-specific path separators automatically and supports method chaining.
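A quick sketch of those Path attributes in action, on a throwaway directory:

```python
import tempfile
from pathlib import Path

root = Path(tempfile.mkdtemp())
(root / "data").mkdir()
csv_file = root / "data" / "report.final.csv"
csv_file.write_text("a,b\n1,2\n")

# Attributes replace os.path.splitext / basename / dirname juggling.
print(csv_file.suffix)       # .csv
print(csv_file.stem)         # report.final (only the last suffix is stripped)
print(csv_file.parent.name)  # data

found = sorted(root.glob("**/*.csv"))  # recursive glob
print(len(found))            # 1
```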

  5. Compute checksums for data integrity verification — Calculate SHA-256 hashes before and after file operations to verify integrity. Store hashes alongside files in a manifest. This catches corruption during transfer, storage, or processing.
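The manifest idea can be sketched as: hash every file under a directory, keep the digests, and re-verify later. The function names here are illustrative, not part of any library.

```python
import hashlib
import tempfile
from pathlib import Path


def build_manifest(directory: Path) -> dict:
    """Map each file's relative path to its SHA-256 digest."""
    return {
        str(p.relative_to(directory)): hashlib.sha256(p.read_bytes()).hexdigest()
        for p in sorted(directory.rglob("*")) if p.is_file()
    }


def verify(directory: Path, manifest: dict) -> list:
    """Return files whose current hash no longer matches the manifest."""
    current = build_manifest(directory)
    return [name for name, digest in manifest.items()
            if current.get(name) != digest]


root = Path(tempfile.mkdtemp())
(root / "a.txt").write_text("hello")
manifest = build_manifest(root)

(root / "a.txt").write_text("tampered")   # simulate corruption in transit
print(verify(root, manifest))             # ['a.txt']
```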

Common Issues

UnicodeDecodeError when reading text files — The file uses a different encoding than expected. Try chardet for detection: import chardet; chardet.detect(open(path, 'rb').read()). Common encodings: UTF-8, Latin-1, cp1252. Use errors='replace' as a last resort to substitute undecodable bytes.
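chardet only produces a guess; a dependency-free fallback is to try candidate encodings in order and reach for errors='replace' only at the very end. (Note latin-1 accepts any byte sequence, so listing it last makes it the effective catch-all.)

```python
def read_text_robust(data: bytes,
                     encodings=("utf-8", "cp1252", "latin-1")) -> str:
    """Try candidate encodings in order; substitute bytes as a last resort."""
    for enc in encodings:
        try:
            return data.decode(enc)
        except UnicodeDecodeError:
            continue
    return data.decode("utf-8", errors="replace")


# cp1252 bytes that are invalid as UTF-8:
sample = "café".encode("cp1252")     # b'caf\xe9'
print(read_text_robust(sample))      # café
```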

File locking conflicts on Windows — Windows doesn't allow reading files that another process has open for writing. Use with statements (context managers) to ensure files are closed promptly, or use tempfile.NamedTemporaryFile(delete=False) plus os.replace() for atomic writes.
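The atomic-write pattern mentioned above, sketched with a temp file created in the target's directory and swapped into place with os.replace (atomic on both POSIX and Windows):

```python
import os
import tempfile
from pathlib import Path


def atomic_write(path, data: str) -> None:
    """Write to a temp file, then atomically swap it into place."""
    target = Path(path)
    fd, tmp_name = tempfile.mkstemp(dir=target.parent)
    try:
        with os.fdopen(fd, "w") as f:
            f.write(data)
        os.replace(tmp_name, target)  # atomic, even over an existing file
    except BaseException:
        os.unlink(tmp_name)           # never leave a half-written temp behind
        raise


target = Path(tempfile.mkdtemp()) / "config.json"
atomic_write(target, '{"ok": true}')
print(target.read_text())   # {"ok": true}
```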

Memory error processing large files — Loading a multi-GB file with json.load() or pd.read_csv() exhausts RAM. Use streaming parsers: ijson for JSON, csv.reader with row-by-row processing, or pandas.read_csv(chunksize=N) for chunked DataFrame processing.
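Row-by-row CSV processing keeps memory flat regardless of file size. A sketch that aggregates a column without ever materializing all rows (io.StringIO stands in for a multi-GB file handle):

```python
import csv
import io


def sum_column(lines, column: str) -> float:
    """Stream rows one at a time; only the running total stays in memory."""
    total = 0.0
    for row in csv.DictReader(lines):
        total += float(row[column])
    return total


big_csv = io.StringIO("id,amount\n1,10.5\n2,4.5\n3,85.0\n")
print(sum_column(big_csv, "amount"))   # 100.0
```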
