# finding entropy in binary files

Update: added routine to print out hex data for blocks where entropy passes a given threshold.
Update: GUI display of graph (using TK) was not working on my system. Code now saves an image of the results in addition to attempting to display with GUI.
Update: These days I use a slightly modified libdisorder and gnuplot, run as `tropy /bin/ls | gnuplot -e 'plot "-" using 2:4;'`. See the submitted patch for details. Regardless, perhaps someone will still find the code from this article useful.
Update: See github for code and target examples
Update: You can also just use `binwalk --entropy`.

Ero Carrera responded yesterday to a request on OpenRCE concerning using entropy analysis to find RSA keys and other random blocks of data in binaries. Herein is a full wrapper for the code he gives. We use matplotlib instead of Mathematica to generate the graph. Also, if you plan to scan files larger than 100k, I’d highly recommend downloading the modified progressBar class included here.

example output:

Target data:

``````data = ''.join (
[chr (random.randint (0, 64)) for x in xrange (1024)] +
[chr (random.randint (0, 255)) for x in xrange (1024)] +
[chr (random.randint (0, 64)) for x in xrange (1024)] )``````

``````[==================================100%=====================================]
949 7.00: 1a060113050c2d0d 17302e091d2d0117 →♠☺‼♣♀- ↨0. ↔-☺↨``````

entropy_graph.py

``````""" Entropy scan
H() and entropy_scan() originally by Ero Carrera (blog.dkbza.org)

Modified May 2007 by cyphunk (deadhacker.com)
Modified Dec 2009 by cyphunk

USAGE:
cmd [target_path]
"""

# FLAGS:
SHOWPROGRESS = 1       # Show console progress bar?
# When a block's entropy is greater than this threshold, print the block's
# offset, its entropy and its first 16 bytes as hex.  Set to 0 to turn off.
PRINTONTHRESHOLD = 6.8
# Set to 1 to print only the first block of each consecutive run of blocks
# that go over the threshold (windows offset by 1 from an already printed
# block are skipped).  Set to 0 to print every block window that matches.
ONLYFIRSTBLOCK = 0
# Size, in bytes, of the sliding window handed to entropy_scan().
BLOCKSIZE = 256

import math
import random
from pylab import *
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
import tkFileDialog
from Tkinter import *
from progressBar import *
from binascii import hexlify
import string
import os
import cPickle # cache results

def H(data):
    """Return the Shannon entropy of *data* in bits per symbol.

    data: a sized sequence of symbols.  A byte string is the usual input,
          but bytes, bytearray and text strings all work because the
          symbols are tallied as-is instead of being matched against
          chr(0)..chr(255).

    Returns 0 for empty input.  For byte data the result lies between 0.0
    (a single repeated value) and 8.0 (all 256 values equally likely).
    """
    # Imported locally so the function does not depend on a file-level import.
    from collections import Counter

    if not data:
        return 0
    total = float(len(data))
    entropy = 0
    # A single pass tallies every symbol.  Sorting keeps the summation in
    # ascending symbol order, which is the order the original 0..255 scan
    # used, so results for byte strings are unchanged.
    for symbol, count in sorted(Counter(data).items()):
        p_x = count / total
        entropy += - p_x*math.log(p_x, 2)
    return entropy

def entropy_scan (data, block_size) :
    """Yield the entropy of every block_size-long window of *data*.

    data:       the sequence to scan (the script passes the raw file bytes).
    block_size: length of the sliding window.

    One value is yielded per start offset, in order, so the n-th value
    describes data[n : n + block_size].  Nothing is yielded when data is
    shorter than block_size.  When SHOWPROGRESS is set, a console progress
    bar is updated as the scan advances.
    """
    # A sequence of N items holds N - block_size + 1 full windows; the
    # "+ 1" keeps the window that ends on the last item.
    window_count = len(data) - block_size + 1
    if window_count <= 0:
        return

    progress = None
    if SHOWPROGRESS:
        # Only built once there is work to do, so the bar never gets an
        # empty (zero-width) value range.
        progress = progressBar(0, window_count, 77)

    for offset in range(window_count):
        if progress is not None:
            progress(offset + 1)
        yield H (data[offset : offset + block_size])

# performance improvement if you have psyco
try:
    import psyco
except ImportError:
    # psyco is optional; without it the scan simply runs unaccelerated.
    pass
else:
    psyco.full()
    print("got psyco")

# get target file as argument var or from dialog:
import sys  # imported here so this step does not rely on a star import leaking `sys`

filename = ""
if sys.argv[1:]:
    # The first command-line argument is the target path.
    filename = sys.argv[1]
else:
    # No argument given: ask with a file dialog.  The root window is hidden
    # so that only the dialog itself shows up.
    root = Tk()
    root.withdraw()
    # NOTE(review): the opening of this call was missing from the published
    # listing; reconstructed around the surviving `filetypes` argument.
    # filename stays empty (falsy) when the dialog is cancelled.
    filename = tkFileDialog.askopenfilename(
        filetypes=[("All files", "*")])

# run, print graph:

if filename:
    # Open file and scan for entropy.
    # NOTE(review): the published listing lost the statements that load the
    # cache and read the target file; they are reconstructed below.
    base, ext = os.path.splitext(filename)
    if ext == ".entropy":
        print("File is a cached '.entropy' from previous scan")
        # NOTE(review): unpickling can execute arbitrary code; only open
        # '.entropy' caches that this script wrote itself.
        with open(filename, 'rb') as cache:
            results = cPickle.load(cache)
        # The cache is written as <target>.entropy, so dropping the
        # extension names the scanned file, which is still needed for the
        # hex dump and for sizing the plot.
        filename = base
        print(filename)
        with open(filename, 'rb') as target:
            raw = target.read()
    else:
        with open(filename, 'rb') as target:
            raw = target.read()
        # debug with test data:
        #   import random
        #   raw = ''.join (
        #       [chr (random.randint (0, 64)) for x in xrange (1024)] +
        #       [chr (random.randint (0, 255)) for x in xrange (1024)] +
        #       [chr (random.randint (0, 64)) for x in xrange (1024)] )
        results = list( entropy_scan(raw,BLOCKSIZE) )
        print("saving cache of entropy scan data to %s" % (filename+".entropy"))
        with open(filename+".entropy", 'wb') as cache:
            cPickle.dump(results, cache)

    # Print blocks that are above a defined threshold of entropy:
    if PRINTONTHRESHOLD > 0:
        print("")
        found = 0
        for offset, value in enumerate(results):
            if value > PRINTONTHRESHOLD:
                if found == 0:
                    # offset, entropy, then the first 16 bytes as two hex words
                    print("0x%8x %.2f: %s %s" % (offset, value,
                                                 hexlify(raw[offset : offset+8]),
                                                 hexlify(raw[offset+8 : offset+16])))
                    # With ONLYFIRSTBLOCK set, stay quiet until the entropy
                    # drops back under the threshold.
                    found = ONLYFIRSTBLOCK
            else:
                found = 0

    # Plot
    filesize = os.path.getsize(filename)
    imgdpi = 100
    # One inch of width per `imgdpi` bytes, capped at 327 inches
    # (presumably to keep the pixel width under 32768 -- TODO confirm) and
    # never below one inch, so files under 100 bytes do not ask for a
    # zero-width figure.
    imgwidth = max(1, min(filesize // imgdpi, 327))

    majorLocator   = MultipleLocator(0x400)   # mark every 1024 bytes
    majorFormatter = FormatStrFormatter('%X') # change to %d to see decimal offsets

    ax = subplot(111)
    plot(results, linewidth=2.0, antialiased=False)

    ax.axis([0,filesize,0,8])
    ax.xaxis.set_major_locator(majorLocator)
    ax.xaxis.set_major_formatter(majorFormatter)
    xticks(rotation=315)

    xlabel('block (byte offset)')
    ylabel('entropy')
    title('Entropy levels')

    grid(True)

    # Save an image next to the target first; the GUI display below may not
    # work on every system.
    img = gcf()
    img.set_size_inches(imgwidth, 6)
    img.savefig(filename+".png", dpi=imgdpi)

    draw()
    show()
``````

progressBar.py (originally from ActiveState, but modified for our use)

``````import sys

class progressBar:
    """ Creates a text-based progress bar. Call the object with the current
    value to redraw the bar on stdout, or use str() to get the bar as a
    string, which looks something like this:

    [=======>        22%                  ]

    You may specify the progress bar's width, min and max values on init.
    """
    def __init__(self, minValue = 0, maxValue = 100, totalWidth=80):
        self.progBar = "[]"   # This holds the progress bar string
        self.min = minValue
        self.max = maxValue
        self.span = maxValue - minValue
        self.width = totalWidth
        self.amount = 0       # When amount == max, we are 100% done
        self._old_pbar = ""   # last string written by __call__
        self.pbar_str = ""
        self.updateAmount(0)  # Build progress bar string

    def updateAmount(self, newAmount = 0):
        """ Update the progress bar with the new amount (with min and max
        values set at initialization; if it is over or under, it takes the
        min or max value as a default. """
        if newAmount < self.min: newAmount = self.min
        if newAmount > self.max: newAmount = self.max
        self.amount = newAmount

        # Figure out the new percent done, round to an integer
        if self.span > 0:
            diffFromMin = float(self.amount - self.min)
            percentDone = (diffFromMin / float(self.span)) * 100.0
            percentDone = int(round(percentDone))
        else:
            # An empty range has nothing left to do; report it as complete
            # instead of dividing by zero.
            percentDone = 100

        # Figure out how many hash bars the percentage should be
        allFull = self.width - 2
        numHashes = (percentDone / 100.0) * allFull
        numHashes = int(round(numHashes))

        # Build a progress bar with an arrow of equal signs; special cases for
        # empty and full
        if numHashes == 0:
            self.progBar = "[>%s]" % (' '*(allFull-1))
        elif numHashes == allFull:
            # The finished bar ends with a newline so later output starts on
            # a fresh line.
            self.progBar = "[%s]\n" % ('='*allFull)
        else:
            self.progBar = "[%s>%s]" % ('='*(numHashes-1),
                                        ' '*(allFull-numHashes))

        # figure out where to put the percentage, roughly centered
        # (floor division keeps the slice index an integer)
        percentPlace = (len(self.progBar) // 2) - len(str(percentDone))
        percentString = str(percentDone) + "%"

        # slice the percentage into the bar
        self.progBar = ''.join([self.progBar[0:percentPlace], percentString,
                                self.progBar[percentPlace+len(percentString):]
                                ])

    def __str__(self):
        return str(self.progBar)

    def __call__(self, value):
        """ Updates the amount, and writes to stdout. Writes a carriage return
        after the bar, so the next update overwrites the current line in
        stdout. Nothing is written when the bar string has not changed."""

        self.updateAmount(value)
        self.pbar_str = str(self)
        if self.pbar_str != self._old_pbar:
            self._old_pbar = self.pbar_str
            sys.stdout.write(self.pbar_str + "\r")
            sys.stdout.flush()
``````