---
name: frequency-analysis
description: Perform letter frequency analysis to crack substitution ciphers. Use this skill when analyzing ciphertext patterns, identifying cipher types, or breaking monoalphabetic substitution ciphers using statistical methods.
---
# Frequency Analysis

Analyze letter and pattern frequencies to break classical ciphers.

## Letter Frequency Analysis
```python
from collections import Counter
import string

# English letter frequencies (approximate)
ENGLISH_FREQ = {
    'E': 12.7, 'T': 9.1, 'A': 8.2, 'O': 7.5, 'I': 7.0,
    'N': 6.7, 'S': 6.3, 'H': 6.1, 'R': 6.0, 'D': 4.3,
    'L': 4.0, 'C': 2.8, 'U': 2.8, 'M': 2.4, 'W': 2.4,
    'F': 2.2, 'G': 2.0, 'Y': 2.0, 'P': 1.9, 'B': 1.5,
    'V': 1.0, 'K': 0.8, 'J': 0.15, 'X': 0.15, 'Q': 0.1,
    'Z': 0.07
}

def analyze_frequency(text: str) -> dict:
    """Calculate letter frequency percentages."""
    letters = [c.upper() for c in text if c.isalpha()]
    total = len(letters)
    if total == 0:
        return {}
    counts = Counter(letters)
    return {char: (count / total) * 100 for char, count in counts.items()}

def compare_to_english(freq: dict) -> float:
    """Calculate chi-squared distance from English."""
    chi_sq = 0
    for letter in string.ascii_uppercase:
        observed = freq.get(letter, 0)
        expected = ENGLISH_FREQ.get(letter, 0)
        if expected > 0:
            chi_sq += ((observed - expected) ** 2) / expected
    return chi_sq

# Example usage
ciphertext = "XLMW MW E WIGVIX QIWWEKI"
freq = analyze_frequency(ciphertext)
for letter, pct in sorted(freq.items(), key=lambda x: -x[1]):
    print(f"{letter}: {pct:.1f}%")
```
## Bigram and Trigram Analysis
```python
def analyze_ngrams(text: str, n: int = 2) -> Counter:
    """Count n-grams in text."""
    text = ''.join(c.upper() for c in text if c.isalpha())
    ngrams = [text[i:i+n] for i in range(len(text) - n + 1)]
    return Counter(ngrams)

# Common English bigrams and trigrams
COMMON_BIGRAMS = ['TH', 'HE', 'IN', 'ER', 'AN', 'RE', 'ON', 'AT', 'EN', 'ND']
COMMON_TRIGRAMS = ['THE', 'AND', 'ING', 'HER', 'HAT', 'HIS', 'THA', 'ERE', 'FOR', 'ENT']

ciphertext = "XLMWMWEWIGVIXQIWWEKI"
bigrams = analyze_ngrams(ciphertext, 2)
trigrams = analyze_ngrams(ciphertext, 3)
print("Top bigrams:", bigrams.most_common(5))
print("Top trigrams:", trigrams.most_common(5))
```
## Index of Coincidence
```python
def index_of_coincidence(text: str) -> float:
    """Calculate IC to identify cipher type.
    English text: ~0.067
    Random text: ~0.038
    """
    letters = [c.upper() for c in text if c.isalpha()]
    n = len(letters)
    if n <= 1:
        return 0
    freq = Counter(letters)
    ic = sum(f * (f - 1) for f in freq.values()) / (n * (n - 1))
    return ic

# Use IC to detect cipher type
ciphertext = "XLMWMWEWIGVIXQIWWEKI"
ic = index_of_coincidence(ciphertext)
print(f"Index of Coincidence: {ic:.4f}")

if ic > 0.060:
    print("Likely monoalphabetic substitution")
elif ic > 0.045:
    print("Likely polyalphabetic (Vigenere)")
else:
    print("Likely random or very strong encryption")
```
## Kasiski Examination (Vigenere Key Length)
```python
from collections import defaultdict

def find_repeated_sequences(ciphertext: str, min_len: int = 3) -> dict:
    """Find repeated sequences and their spacings."""
    text = ''.join(c.upper() for c in ciphertext if c.isalpha())
    sequences = defaultdict(list)
    for seq_len in range(min_len, min(len(text) // 2, 10)):
        for i in range(len(text) - seq_len):
            seq = text[i:i + seq_len]
            for j in range(i + seq_len, len(text) - seq_len + 1):
                if text[j:j + seq_len] == seq:
                    sequences[seq].append(j - i)
    return sequences

def kasiski_key_length(ciphertext: str) -> list:
    """Estimate Vigenere key length; returns (candidate length, vote count) pairs."""
    sequences = find_repeated_sequences(ciphertext)
    spacings = []
    for positions in sequences.values():
        spacings.extend(positions)
    if not spacings:
        return []
    # Count small factors of the spacings; the true key length tends to divide most of them
    factors = defaultdict(int)
    for spacing in spacings:
        for i in range(2, min(spacing + 1, 20)):
            if spacing % i == 0:
                factors[i] += 1
    return sorted(factors.items(), key=lambda x: -x[1])[:5]

# Example
ciphertext = "VVHQWVVRHMUSGJGTHKIHTSSEJCHLSFCBGVWCRLRYQTFSVGAHWKCUHWAUGLQHNSLRLJSHBLTSPISPRDXLJSVEEGHLQWKASSKUWEPWQTWVSPGOELKCQYFNSVWLJSNIQKGNRGYBWLWGOVHAKSHXBAXMZSSHXBHGSHTCHLSTWDDWW"
key_lengths = kasiski_key_length(ciphertext)
print("Likely key lengths:", key_lengths)
```
## Pattern Word Matching
```python
def get_word_pattern(word: str) -> str:
    """Get pattern for word matching.
    HELLO -> 0.1.2.2.3
    """
    word = word.upper()
    mapping = {}
    pattern = []
    next_num = 0
    for char in word:
        if char not in mapping:
            mapping[char] = next_num
            next_num += 1
        pattern.append(str(mapping[char]))
    return '.'.join(pattern)

# Common English words by pattern
PATTERN_WORDS = {
    '0.1.2': ['THE', 'AND', 'FOR', 'ARE', 'BUT'],
    '0.1.2.3': ['WITH', 'HAVE', 'THIS', 'FROM'],
    '0.1.2.0': ['THAT', 'HIGH', 'DEAD'],
    '0.1.1.2': ['BEEN', 'FEET', 'GOOD'],
    '0': ['A', 'I'],
}

# Find possible decryptions for a pattern
cipher_word = "XYY"
pattern = get_word_pattern(cipher_word)  # "0.1.1"
print(f"Pattern: {pattern}")
```
## Automated Substitution Solver
```python
import random

def score_text(text: str) -> float:
    """Score text based on English frequency match."""
    freq = analyze_frequency(text)
    return -compare_to_english(freq)  # Higher is better

def apply_key(text: str, key: list) -> str:
    """Apply substitution key to text."""
    table = str.maketrans(string.ascii_uppercase, ''.join(key))
    table.update(str.maketrans(string.ascii_lowercase, ''.join(key).lower()))
    return text.translate(table)

def solve_substitution_hill_climbing(ciphertext: str, iterations: int = 10000):
    """Solve substitution cipher using hill climbing."""
    alphabet = list(string.ascii_uppercase)
    best_key = alphabet.copy()
    random.shuffle(best_key)
    best_score = score_text(apply_key(ciphertext, best_key))
    for _ in range(iterations):
        # Swap two random letters
        new_key = best_key.copy()
        i, j = random.sample(range(26), 2)
        new_key[i], new_key[j] = new_key[j], new_key[i]
        plaintext = apply_key(ciphertext, new_key)
        score = score_text(plaintext)
        if score > best_score:
            best_key = new_key
            best_score = score
    return ''.join(best_key), apply_key(ciphertext, best_key)
```
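
Hill climbing of this kind stalls in local optima, so it is usually run with several random restarts, and single-letter frequency scoring alone is often too weak for short texts (quadgram log-probability scoring is the common upgrade). A usage sketch with restarts; `solve_with_restarts` is a wrapper introduced here for illustration:

```python
def solve_with_restarts(ciphertext: str, restarts: int = 20, iterations: int = 2000):
    """Run the hill climber several times and keep the best-scoring result."""
    best = None
    for _ in range(restarts):
        key, plaintext = solve_substitution_hill_climbing(ciphertext, iterations)
        candidate = (score_text(plaintext), key, plaintext)
        if best is None or candidate[0] > best[0]:
            best = candidate
    return best

best_score, best_key, best_plaintext = solve_with_restarts("XLMW MW E WIGVIX QIWWEKI")
print(best_key, best_plaintext)
```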