From f7b8c071a84355d044252e4266baa6758e6c6108 Mon Sep 17 00:00:00 2001
From: definus6-dev
Date: Fri, 23 Jan 2026 23:02:17 +0900
Subject: [PATCH 1/2] refactor: generalize entropy calculation and improve semantic clarity

- Extracted core Shannon entropy calculation into a reusable pure function
- Separated text analysis logic from computation for better modularity
- Improved variable naming to reflect information theory concepts
- Optimized computational complexity from O(A^2) to O(N)
- Added physical and mathematical context to documentation
---
 maths/entropy.py | 214 ++++++++++++++++++++++-------------------------
 1 file changed, 98 insertions(+), 116 deletions(-)

diff --git a/maths/entropy.py b/maths/entropy.py
index b816f1d193f7..a1e434474ecf 100644
--- a/maths/entropy.py
+++ b/maths/entropy.py
@@ -1,132 +1,114 @@
-#!/usr/bin/env python3
-
-"""
-Implementation of entropy of information
-https://en.wikipedia.org/wiki/Entropy_(information_theory)
-"""
-
 from __future__ import annotations
 
 import math
 from collections import Counter
-from string import ascii_lowercase
+"""
+In information theory, entropy is a measure of the uncertainty or randomness of a
+source of data. It quantifies the expected amount of information contained in each
+message from the source.
 
-def calculate_prob(text: str) -> None:
-    """
-    This method takes path and two dict as argument
-    and than calculates entropy of them.
-    :param dict:
-    :param dict:
-    :return: Prints
-    1) Entropy of information based on 1 alphabet
-    2) Entropy of information based on couples of 2 alphabet
-    3) print Entropy of H(X n|Xn-1)
-
-    Text from random books. Also, random quotes.
-    >>> text = ("Behind Winston's back the voice "
-    ...         "from the telescreen was still "
-    ...         "babbling and the overfulfilment")
-    >>> calculate_prob(text)
-    4.0
-    6.0
-    2.0
-
-    >>> text = ("The Ministry of Truth—Minitrue, in Newspeak [Newspeak was the official"
-    ...         "face in elegant lettering, the three")
-    >>> calculate_prob(text)
-    4.0
-    5.0
-    1.0
-    >>> text = ("Had repulsive dashwoods suspicion sincerity but advantage now him. "
-    ...         "Remark easily garret nor nay. Civil those mrs enjoy shy fat merry. "
-    ...         "You greatest jointure saw horrible. He private he on be imagine "
-    ...         "suppose. Fertile beloved evident through no service elderly is. Blind "
-    ...         "there if every no so at. Own neglected you preferred way sincerity "
-    ...         "delivered his attempted. To of message cottage windows do besides "
-    ...         "against uncivil. Delightful unreserved impossible few estimating "
-    ...         "men favourable see entreaties. She propriety immediate was improving. "
-    ...         "He or entrance humoured likewise moderate. Much nor game son say "
-    ...         "feel. Fat make met can must form into gate. Me we offending prevailed "
-    ...         "discovery.")
-    >>> calculate_prob(text)
-    4.0
-    7.0
-    3.0
+The core formula for Shannon Entropy H(X) is:
+    H(X) = -Σ P(x) * log2(P(x))
+where P(x) is the probability of an event x occurring.
+
+This concept mirrors the thermodynamic entropy in physics, representing the level
+of disorder in a system. In a digital context, it defines the theoretical limit
+for data compression.
+
+Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
+"""
+
+
+def shannon_entropy(probabilities: list[float]) -> float:
     """
-    single_char_strings, two_char_strings = analyze_text(text)
-    my_alphas = list(" " + ascii_lowercase)
-    # what is our total sum of probabilities.
-    all_sum = sum(single_char_strings.values())
-
-    # one length string
-    my_fir_sum = 0
-    # for each alpha we go in our dict and if it is in it we calculate entropy
-    for ch in my_alphas:
-        if ch in single_char_strings:
-            my_str = single_char_strings[ch]
-            prob = my_str / all_sum
-            my_fir_sum += prob * math.log2(prob)  # entropy formula.
-
-    # print entropy
-    print(f"{round(-1 * my_fir_sum):.1f}")
-
-    # two len string
-    all_sum = sum(two_char_strings.values())
-    my_sec_sum = 0
-    # for each alpha (two in size) calculate entropy.
-    for ch0 in my_alphas:
-        for ch1 in my_alphas:
-            sequence = ch0 + ch1
-            if sequence in two_char_strings:
-                my_str = two_char_strings[sequence]
-                prob = int(my_str) / all_sum
-                my_sec_sum += prob * math.log2(prob)
-
-    # print second entropy
-    print(f"{round(-1 * my_sec_sum):.1f}")
-
-    # print the difference between them
-    print(f"{round((-1 * my_sec_sum) - (-1 * my_fir_sum)):.1f}")
-
-
-def analyze_text(text: str) -> tuple[dict, dict]:
+    Calculates the Shannon entropy of a given probability distribution.
+
+    Args:
+        probabilities: A list of probabilities representing a discrete distribution.
+
+    Returns:
+        The entropy value in bits.
+
+    Raises:
+        ValueError: If probabilities are negative or do not sum to approximately 1.0.
+
+    Examples:
+        >>> shannon_entropy([0.5, 0.5])
+        1.0
+        >>> shannon_entropy([1.0, 0.0])
+        0.0
+        >>> shannon_entropy([0.25, 0.25, 0.25, 0.25])
+        2.0
     """
-    Convert text input into two dicts of counts.
-    The first dictionary stores the frequency of single character strings.
-    The second dictionary stores the frequency of two character strings.
+    if any(p < 0 for p in probabilities):
+        raise ValueError("Probabilities cannot be negative.")
+
+    # Due to floating point precision, we check for closeness to 1.0
+    if not math.isclose(sum(probabilities), 1.0, rel_tol=1e-9) and sum(probabilities) > 0:
+        # Normalize if not summed to 1 but has values
+        probabilities = [p / sum(probabilities) for p in probabilities]
+
+    entropy = 0.0
+    for p in probabilities:
+        if p > 0:
+            entropy -= p * math.log2(p)
+
+    return entropy
+
+
+def analyze_text_entropy(text: str) -> dict[str, float]:
     """
-    single_char_strings = Counter()  # type: ignore[var-annotated]
-    two_char_strings = Counter()  # type: ignore[var-annotated]
-    single_char_strings[text[-1]] += 1
+    Analyzes the entropy of a given text at different levels (1-gram, 2-gram).
 
-    # first case when we have space at start.
-    two_char_strings[" " + text[0]] += 1
-    for i in range(len(text) - 1):
-        single_char_strings[text[i]] += 1
-        two_char_strings[text[i : i + 2]] += 1
-    return single_char_strings, two_char_strings
+    Args:
+        text: The input string to analyze.
+
+    Returns:
+        A dictionary containing entropy values for different n-gram levels.
 
-def main():
+    Examples:
+        >>> result = analyze_text_entropy("aaaaa")
+        >>> result['1-gram']
+        0.0
+        >>> result = analyze_text_entropy("abab")
+        >>> round(result['1-gram'], 2)
+        1.0
+    """
+    if not text:
+        return {"1-gram": 0.0, "2-gram": 0.0}
+
+    # 1-gram analysis (individual characters)
+    counts_1gram = Counter(text)
+    total_chars = len(text)
+    probs_1gram = [count / total_chars for count in counts_1gram.values()]
+    entropy_1gram = shannon_entropy(probs_1gram)
+
+    # 2-gram analysis (pairs of characters)
+    if len(text) < 2:
+        entropy_2gram = 0.0
+    else:
+        pairs = [text[i : i + 2] for i in range(len(text) - 1)]
+        counts_2gram = Counter(pairs)
+        total_pairs = len(pairs)
+        probs_2gram = [count / total_pairs for count in counts_2gram.values()]
+        entropy_2gram = shannon_entropy(probs_2gram)
+
+    return {
+        "1-gram": entropy_1gram,
+        "2-gram": entropy_2gram,
+        "conditional_entropy": max(0.0, entropy_2gram - entropy_1gram),
+    }
+
+
+if __name__ == "__main__":
     import doctest
 
     doctest.testmod()
-    # text = (
-    #     "Had repulsive dashwoods suspicion sincerity but advantage now him. Remark "
-    #     "easily garret nor nay. Civil those mrs enjoy shy fat merry. You greatest "
-    #     "jointure saw horrible. He private he on be imagine suppose. Fertile "
-    #     "beloved evident through no service elderly is. Blind there if every no so "
-    #     "at. Own neglected you preferred way sincerity delivered his attempted. To "
-    #     "of message cottage windows do besides against uncivil. Delightful "
-    #     "unreserved impossible few estimating men favourable see entreaties. She "
-    #     "propriety immediate was improving. He or entrance humoured likewise "
-    #     "moderate. Much nor game son say feel. Fat make met can must form into "
-    #     "gate. Me we offending prevailed discovery. "
-    # )
-
-    # calculate_prob(text)
 
-if __name__ == "__main__":
-    main()
+    # Manual demonstration
+    sample_text = "Behind Winston's back the voice from the telescreen was still"
+    entropy_stats = analyze_text_entropy(sample_text)
+    print(f"Text: '{sample_text[:30]}...'")
+    for level, value in entropy_stats.items():
+        print(f"{level:>20}: {value:.4f} bits")
\ No newline at end of file

From 989af3528e6cbeb495791b6b72c5e344e2be7a92 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 23 Jan 2026 14:04:10 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 maths/entropy.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/maths/entropy.py b/maths/entropy.py
index a1e434474ecf..e3875c78c72a 100644
--- a/maths/entropy.py
+++ b/maths/entropy.py
@@ -45,7 +45,10 @@ def shannon_entropy(probabilities: list[float]) -> float:
         raise ValueError("Probabilities cannot be negative.")
 
     # Due to floating point precision, we check for closeness to 1.0
-    if not math.isclose(sum(probabilities), 1.0, rel_tol=1e-9) and sum(probabilities) > 0:
+    if (
+        not math.isclose(sum(probabilities), 1.0, rel_tol=1e-9)
+        and sum(probabilities) > 0
+    ):
         # Normalize if not summed to 1 but has values
         probabilities = [p / sum(probabilities) for p in probabilities]
 
@@ -111,4 +114,4 @@ def analyze_text_entropy(text: str) -> dict[str, float]:
     entropy_stats = analyze_text_entropy(sample_text)
     print(f"Text: '{sample_text[:30]}...'")
    for level, value in entropy_stats.items():
-        print(f"{level:>20}: {value:.4f} bits")
\ No newline at end of file
+        print(f"{level:>20}: {value:.4f} bits")
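
A minimal usage sketch of the API introduced in PATCH 1/2, for anyone reviewing the series. It is illustrative only, not part of the patches; it assumes maths/entropy.py is importable from the repository root (e.g. run from the top level of the repo), and the expected values follow directly from H(X) = -Σ P(x) * log2(P(x)).

    # Illustrative only; not part of the patch series above.
    # Assumes the repository root is on sys.path so maths.entropy resolves.
    from maths.entropy import analyze_text_entropy, shannon_entropy

    # A fair coin carries exactly 1 bit per toss: -2 * 0.5 * log2(0.5) = 1.
    assert shannon_entropy([0.5, 0.5]) == 1.0

    # Four equally likely symbols need 2 bits each: -4 * 0.25 * log2(0.25) = 2.
    assert shannon_entropy([0.25, 0.25, 0.25, 0.25]) == 2.0

    # Text analysis reports per-level entropies in bits; "conditional_entropy"
    # is the difference between the 2-gram and 1-gram estimates, floored at 0.
    stats = analyze_text_entropy("abababab")
    print(stats["1-gram"], stats["2-gram"], stats["conditional_entropy"])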