I wrote a script with utilities for calculating the entropy of iterables and included a Tk GUI that shows a quick overview of a text's properties in real time. (on GitHub)
I tried to follow PEP 8 as closely as possible, but I'm not sure about other things, specifically:
- I think my docstrings are sometimes overly redundant, see the GUI for example.
- In `gui.py`, I'm not sure if I should move the `calculate` method out of the `GUI` class.
- Is the overall design good? I know it's a rather small project, but I want to do this correctly.
If you have any other concerns beside these questions, I'm open to criticism!
The code is split into two modules:
calc.py - Includes the calculation functions
"""Utilities for entropy-related calculations."""
from collections import Counter as _Counter
from math import ceil as _ceil, log2 as _log2
def prob_to_info(probability):
    """Convert a probability into its information content in bits.

    The information is the negated base-2 logarithm of the probability.
    A probability of zero has no finite information content, so None is
    returned for it; a probability of one yields exactly 0.
    """
    if probability == 0:
        return None
    if probability == 1:
        # Short-circuit so the result is a plain 0 rather than -0.0.
        return 0
    return -_log2(probability)
def info_to_prob(information):
    """Convert information measured in bits back into a probability.

    This is the inverse of the base-2 logarithm: two raised to the negated
    number of bits.
    """
    exponent = -information
    return 2 ** exponent
def entropy(iterable):
    """Return the Shannon entropy of the given iterable.

    Every unique item contributes its probability multiplied by its
    information content; the contributions are summed in the order in
    which char_mapping() reports them.
    """
    total = 0
    for _, probability in char_mapping(iterable):
        total += probability * prob_to_info(probability)
    return total
def optimal_bits(iterable):
    """Return the number of bits needed with a whole-bit code per item.

    The entropy of the iterable is rounded up to a whole number of bits
    and multiplied by the number of items, so the argument must support
    len().
    """
    bits_per_item = _ceil(entropy(iterable))
    return bits_per_item * len(iterable)
def metric_entropy(iterable):
    """Return the Shannon entropy of the iterable divided by its length.

    The argument must support len(). An empty iterable yields 0.0 instead
    of raising ZeroDivisionError, so callers can pass empty input safely.
    """
    length = len(iterable)
    if length == 0:
        # Nothing to measure; dividing by the length would fail.
        return 0.0
    return entropy(iterable) / length
def char_mapping(iterable):
    """Return the unique items of the iterable with their probabilities.

    The result is a list of ``(item, probability)`` tuples sorted by
    descending probability. Items with equal probability keep the order in
    which they first appear. An empty iterable yields an empty list.

    Items are counted in a single pass, so any iterable of hashable items
    is accepted, including one that can only be consumed once.
    """
    counts = _Counter(iterable)
    total = sum(counts.values())
    # sorted() is stable, so ties stay in first-seen order.
    return sorted(
        ((item, count / total) for item, count in counts.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
gui.py
import tkinter as tk
import calc
class GUI:
    """A simple Tk-based interface for real-time entropy analysis of text.

    The window holds an editable input box, an "Ignore case" checkbox and
    a read-only output box that is refreshed with statistics about the
    current input.
    """

    def __init__(self, root):
        """Build the widgets inside 'root', a tkinter.Tk instance."""
        self.parent = root
        try:
            self.parent.state("zoomed")
        except tk.TclError:
            # X11 rejects the "zoomed" state; there the window is
            # maximised through the "-zoomed" attribute instead.
            self.parent.attributes("-zoomed", True)

        self.frame = tk.Frame(self.parent)
        self.frame.grid(row=0, column=0, sticky="nwes")

        self.input_head = tk.Label(self.frame, text="Input:")
        self.input_head.grid(row=0, column=0, sticky="nwes")

        self.ignore_case_value = tk.IntVar()
        # Re-run the analysis whenever the checkbox is toggled.
        self.ignore_case_value.trace("w", self.case_switch)
        self.ignore_case = tk.Checkbutton(
            self.frame,
            variable=self.ignore_case_value,
            text="Ignore case"
        )
        self.ignore_case.grid(row=0, column=1, sticky="nwes")

        self.input_main = tk.Text(self.frame)
        self.input_main.grid(row=1, column=0, sticky="nwes", columnspan=2)
        self.input_main.bind("<KeyRelease>", self.update)

        self.output_head = tk.Label(self.frame, text="Output:")
        self.output_head.grid(row=0, column=2, sticky="nwes")

        # The output box is read-only; update() enables it temporarily.
        self.output_main = tk.Text(self.frame, state=tk.DISABLED)
        self.output_main.grid(row=1, column=2, sticky="nwes")

        # Let the frame and its text row/columns absorb any extra space.
        self.parent.rowconfigure(0, weight=1)
        self.parent.columnconfigure(0, weight=1)
        self.frame.rowconfigure(1, weight=1)
        self.frame.columnconfigure(0, weight=1)
        self.frame.columnconfigure(1, weight=1)
        self.frame.columnconfigure(2, weight=1)

    def case_switch(self, *_):
        """Refresh the analysis after the case-sensitivity box changed."""
        # Mark the input as modified so update() does not return early.
        self.input_main.edit_modified(True)
        self.update()

    def update(self, *_):
        """Rewrite the output box if the input changed since the last run."""
        if not self.input_main.edit_modified():
            return

        analyze_text = self.calculate()

        self.output_main["state"] = tk.NORMAL
        self.output_main.delete("1.0", tk.END)
        self.output_main.insert("1.0", analyze_text)
        self.output_main["state"] = tk.DISABLED

        self.input_main.edit_modified(False)

    def calculate(self, *_):
        """Return the analysis text for the current input.

        The text consists of a summary (length, unique characters, entropy,
        metric entropy, optimal bit usage) followed by a table with one row
        per unique character. The input is lower-cased first when the
        "Ignore case" box is ticked.
        """
        text = self.input_main.get("1.0", "end-1c")
        if self.ignore_case_value.get():
            text = text.lower()

        char_map = calc.char_mapping(text)
        # The entropy is computed from the text itself; passing the
        # probability table would measure the table, not the text.
        entropy = calc.entropy(text)
        # calc.metric_entropy divides by the length, so skip it when empty.
        metric_entropy = calc.metric_entropy(text) if text else 0.0
        optimal = calc.optimal_bits(text)

        info = "\n".join(
            [
                "Length: " + str(len(text)),
                "Unique chars: " + str(len(char_map)),
                "Entropy: " + str(entropy),
                "Metric entropy: " + str(metric_entropy),
                "Optimal bit usage: " + str(optimal)
            ]
        )

        table_head = " Char\t| Probability\t\t| Bits\t\t| Occurences"
        table_body = "\n".join(
            [
                # repr() makes whitespace characters visible; the slice
                # strips the surrounding quotes.
                " " + repr(char)[1:-1] +
                "\t" + str(round(prob, 7)) +
                "\t\t" + str(round(calc.prob_to_info(prob), 7)) +
                "\t\t" + str(text.count(char))
                for char, prob in char_map
            ]
        )
        table = "\n".join([table_head, table_body])

        return "\n\n".join([info, table])
def main():
    """Create the Tk root window, attach the GUI and run the event loop."""
    window = tk.Tk()
    # Hold on to the GUI object for as long as the event loop is running.
    _app = GUI(window)
    window.mainloop()


if __name__ == "__main__":
    main()
`ceil` and `log2`?