#!/usr/bin/env python3
"""Baseline-subtraction noise filter for SAND captures.

Idea: capture a short "noise baseline" with SAND NOT running, then capture the
real session with SAND running. Every IP/host in the baseline is pre-SAND noise;
subtract it and what's left is (almost entirely) the game's traffic.

Usage:
  # 1) just list the noise in a baseline:
  venv/bin/python reverse/noise_filter.py baseline.pcapng

  # 2) baseline + session -> hosts unique to the session + a ready Wireshark filter:
  venv/bin/python reverse/noise_filter.py baseline.pcapng session.pcapng

Outputs, for the session run:
  - the new (non-noise) IPs/hosts, sorted by traffic volume
  - a Wireshark *display* filter: ip.addr==X or ip.addr==Y ...
  - a *capture* (BPF) filter to EXCLUDE noise next time: not (host X or host Y ...)
"""
import ipaddress
import os
import sys
from collections import defaultdict

from scapy.all import rdpcap, DNS, DNSQR, IP, IPv6, TCP, UDP, Raw

# Make the sibling module importable no matter how the script was invoked
# (relative path, bare file name, or a path using a non-"/" separator).
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from capture_hosts import tls_sni  # reuse the SNI parser

# hosts/IPs that are never the game, even if they appear only in the session
ALWAYS_NOISE_SUBSTR = ("anthropic.com", "datadoghq.com", "windowsupdate",
                       "msftncsi", "msftconnecttest", "ntp.", ".pool.ntp.org")

# IPv6 unique-local addresses (the IPv6 counterpart of the RFC 1918 ranges).
_IPV6_ULA = ipaddress.ip_network("fc00::/7")


def scan(path):
    """Return (ip_volume, ip2host) for a pcap.

    ip_volume[ip] = packet count to/from that remote ip;
    ip2host[ip]   = best label: the TLS SNI seen in a packet sent to that ip
                    if any, else the DNS query name whose answer contained the
                    ip, else "".

    Addresses for which is_local() is true are not counted in ip_volume.
    """
    packets = rdpcap(path)  # reads the whole capture into memory
    vol = defaultdict(int)
    ip2host, dns = {}, {}

    for p in packets:
        # learn DNS answers (qname for an ip)
        if p.haslayer(DNS) and p[DNS].qr == 1 and p[DNS].ancount:
            try:
                qn = p[DNSQR].qname.decode(errors="replace").rstrip(".")
                for k in range(p[DNS].ancount):
                    rr = p[DNS].an[k]
                    if rr.type in (1, 28):  # A / AAAA records
                        dns[str(rr.rdata)] = qn
            except Exception:
                # Deliberately best-effort: a truncated or malformed DNS
                # response must not abort the scan; labels are only hints.
                pass

        # Pick the network layer once; packets without one carry no address.
        if p.haslayer(IP):
            ipl = p[IP]
        elif p.haslayer(IPv6):
            ipl = p[IPv6]
        else:
            continue

        # learn SNI: label the destination of a TLS ClientHello.
        # NOTE(review): tls_sni presumably returns the server name or a falsy
        # value when the payload is not a ClientHello -- confirm in capture_hosts.
        if p.haslayer(TCP) and p.haslayer(Raw):
            s = tls_sni(bytes(p[Raw].load))
            if s:
                ip2host[ipl.dst] = s

        for ip in (ipl.src, ipl.dst):
            if not is_local(ip):
                vol[ip] += 1

    # SNI labels win; fall back to the DNS name (or "") for everything else.
    for ip in vol:
        ip2host.setdefault(ip, dns.get(ip, ""))
    return vol, ip2host


def is_local(ip):
    """Return True if *ip* (a textual IPv4/IPv6 address) is not a remote host.

    Covers private, loopback, link-local, unspecified, multicast and
    (heuristically, via a trailing ".255") broadcast addresses.
    """
    # Cheap textual checks first.
    if ip.startswith(("10.", "192.168.", "127.", "169.254.", "fe80:", "ff", "::1")):
        return True
    if ip.startswith("172.") and 16 <= int(ip.split(".")[1] or 0) <= 31:
        return True
    if ip == "0.0.0.0" or ip.endswith(".255"):
        return True

    # Cases the prefix checks miss: IPv4 multicast (224.0.0.0/4, e.g. mDNS and
    # SSDP), the IPv6 unspecified address, the rest of fe80::/10 and IPv6
    # unique-local addresses.
    try:
        addr = ipaddress.ip_address(ip)
    except ValueError:
        return False  # not a parseable address: leave it to the checks above
    return (addr.is_multicast or addr.is_unspecified or addr.is_loopback
            or addr.is_link_local
            or (addr.version == 6 and addr in _IPV6_ULA))


def _display_term(ip):
    """Return the Wireshark display-filter term matching address *ip*."""
    # Wireshark uses a separate field for IPv6 addresses; ip.addr is IPv4 only.
    field = "ipv6.addr" if ":" in ip else "ip.addr"
    return "%s==%s" % (field, ip)


def main():
    """Command-line entry point.

    With one pcap argument, list the remote IPs seen in the baseline.
    With two, also list the IPs that appear only in the session capture and
    print a Wireshark display filter for them plus a BPF capture filter that
    excludes the baseline IPs. With no argument, exit with the usage text.
    """
    if len(sys.argv) < 2:
        sys.exit(__doc__)

    base_vol, base_host = scan(sys.argv[1])
    noise = set(base_vol)
    print("=== baseline noise: %d remote IPs ===" % len(noise))
    for ip, n in sorted(base_vol.items(), key=lambda x: -x[1]):
        print(" %-16s %-6d %s" % (ip, n, base_host.get(ip, "")))
    if len(sys.argv) < 3:
        print("\n(pass a second pcap to diff a real session against this baseline)")
        return

    sess_vol, sess_host = scan(sys.argv[2])

    # a session ip is "game" if not in baseline and not on the always-noise list
    def always_noise(ip):
        # Host names are case-insensitive, so compare in lower case.
        h = sess_host.get(ip, "").lower()
        return any(s in h for s in ALWAYS_NOISE_SUBSTR)

    new = {ip: n for ip, n in sess_vol.items()
           if ip not in noise and not always_noise(ip)}
    print("\n=== session-only hosts (candidate SAND backends) ===")
    for ip, n in sorted(new.items(), key=lambda x: -x[1]):
        print(" %-16s %-6d %s" % (ip, n, sess_host.get(ip, "")))
    if not new:
        print(" (nothing new — either SAND made no new connections, or it reused a "
              "baseline IP/CDN; widen the gap or capture longer)")
        return

    ips = sorted(new)
    print("\n--- Wireshark DISPLAY filter (keep only SAND) ---")
    print(" " + " or ".join(_display_term(ip) for ip in ips))
    print("\n--- Wireshark CAPTURE filter (BPF, EXCLUDE noise next time) ---")
    if noise:
        print(" not (" + " or ".join("host %s" % ip for ip in sorted(noise)) + ")")
    else:
        # "not ()" is not a valid BPF expression, so say so instead.
        print(" (baseline contained no remote IPs; nothing to exclude)")


if __name__ == "__main__":
    main()