finding entropy in binary files

Update: added routine to print out hex data for blocks where entropy passes a given threshold. Tidied up code.

Ero Carrera responded yesterday to a request on OpenRCE concerning the use of entropy analysis to find RSA keys and other random blocks of data in binaries. Here is a full wrapper for the code he gives. We use matplotlib instead of Mathematica to generate the graph. Also, if you plan to scan files larger than 100k, I’d highly recommend downloading the modified progressBar class included here.

example output:

Target data:

# Sample input in three 1024-byte segments: byte values drawn from 0..64,
# then from the full 0..255 range, then from 0..64 again.
data = ''.join (
  [chr (random.randint (0, 64)) for x in xrange (1024)] +
  [chr (random.randint (0, 255)) for x in xrange (1024)] +
  [chr (random.randint (0, 64)) for x in xrange (1024)] )

example.png

[==================================100%=====================================]
     949 7.00: 1a060113050c2d0d 17302e091d2d0117 →♠☺‼♣♀- ↨0. ↔-☺↨

entropy_graph.py

""" Entropy scan
    H() and entropy_scan() originally by Ero Carrera (blog.dkbza.org)
    Modified May 2007 by cyphunk (deadhacker.com)

    USAGE:
    cmd [target_path]
    """

# FLAGS:
SHOWPROGRESS = 1       # Non-zero: show a console progress bar while
                       # scanning.
PRINTONTHRESHOLD = 7 # When a block's entropy is greater than this
                       # threshold, print its offset, its entropy and
                       # its first 16 bytes in both hex and ascii.
                       # Set to 0 to turn the printing off.
ONLYFIRSTBLOCK = 1     # Set to 1 to print only the first block of a
                       # run of consecutive offsets that are over the
                       # threshold, instead of every window that is
                       # offset by 1 from the previous match.  Set to 0
                       # to print every matching window.
BLOCKSIZE = 256        # Size in bytes of the blocks (windows) scanned.

import math
import random
import string
import sys
from pylab import *
import tkFileDialog
from Tkinter import *
from progressBar import *
from binascii import hexlify

def H(data):
  """Return the Shannon entropy of `data` in bits per symbol.

  Only the 256 single-character symbols chr(0)..chr(255) are counted, so
  the result lies between 0 and 8.  Empty input yields 0.
  """
  if not data:
    return 0
  length = len(data)
  bits = 0
  for code in range(256):
    occurrences = data.count(chr(code))
    if occurrences == 0:
      # Symbols that never occur contribute nothing (and log(0) is undefined).
      continue
    frequency = float(occurrences) / length
    bits += -frequency * math.log(frequency, 2)
  return bits

def entropy_scan(data, block_size):
  """Yield the entropy H() of every block_size-long window of `data`.

  Windows start at every offset from 0 up to and including
  len(data) - block_size, so the final complete window is scanned too.
  Nothing is yielded when `data` is shorter than `block_size`.

  When SHOWPROGRESS is set, a console progress bar is advanced once per
  window.
  """
  # Number of complete windows; the last one starts at len(data) - block_size.
  window_count = len(data) - block_size + 1
  if window_count <= 0:
    # No complete window fits; also avoids building a progress bar with
    # an empty or negative span.
    return
  progress = None
  if SHOWPROGRESS:
    progress = progressBar(0, window_count, 77)
  for offset in range(window_count):
    if progress is not None:
      progress(offset + 1)
    yield H(data[offset : offset + block_size])

# get target file as argument var or from dialog:
if len(sys.argv) < 2:
    # No path on the command line: ask for one through a file dialog.
    # The Tk root window is withdrawn before the dialog is shown.
    root = Tk()
    root.withdraw()
    filename = tkFileDialog.askopenfilename(
        filetypes=[("All files", "*")],
        title="Target binary")
else:
    # First command-line argument is the target path.
    filename = sys.argv[1]

# run, print graph:

if filename:
    # Read the whole target; close the handle even if the read fails.
    data = open(filename, 'rb')
    try:
        raw = data.read()
    finally:
        data.close()

    # One entropy value per window offset.
    results = list(entropy_scan(raw, BLOCKSIZE))

    # Print blocks that are above a defined threshold of entropy:
    if PRINTONTHRESHOLD > 0:
        print("")
        # CR, LF and TAB would break the one-line-per-block layout, so
        # they are shown as spaces in the ascii column.
        table = string.maketrans("\r\n\t", "   ")
        # Non-zero while inside a run of over-threshold offsets whose
        # first block has already been printed (see ONLYFIRSTBLOCK).
        found = 0
        for i, entropy in enumerate(results):
            if entropy > PRINTONTHRESHOLD:
                if found == 0:
                    head = raw[i : i + 16]
                    print("%8d %.2f: %s %s %s" % (i, entropy,
                                                  hexlify(head[:8]),
                                                  hexlify(head[8:]),
                                                  head.translate(table)))
                    found = ONLYFIRSTBLOCK
            else:
                # Dropped back under the threshold: the run is over.
                found = 0

    # Plot
    plot(results)
    xlabel('block')
    ylabel('entropy')
    title('Entropy levels')
    grid(True)
    show()

progressBar.py (originally from ActiveState, but modified for our use)

import sys

class progressBar:
    """ Creates a text-based progress bar. Call the object with the current
        value to redraw it on stdout, or convert it with str() to get the
        bar, which looks something like this:

        [=======>        22%                  ]

        You may specify the progress bar's width, min and max values on init.
    """
    def __init__(self, minValue = 0, maxValue = 100, totalWidth=80):
        self.progBar = "[]"   # This holds the progress bar string
        self.min = minValue
        self.max = maxValue
        self.span = maxValue - minValue
        self.width = totalWidth
        self.amount = 0       # When amount == max, we are 100% done
        self.updateAmount(0)  # Build progress bar string
        self._old_pbar = ""   # last string written, used to track change
        self.pbar_str = ""

    def updateAmount(self, newAmount = 0):
        """ Update the progress bar with the new amount (with min and max
            values set at initialization; if it is over or under, it takes the
            min or max value as a default. """
        # Clamp low first, then high, so that max wins when max < min.
        if newAmount < self.min: newAmount = self.min
        if newAmount > self.max: newAmount = self.max
        self.amount = newAmount

        # Figure out the new percent done, round to an integer.  An empty
        # span means there is nothing to do, which is reported as 100%
        # instead of dividing by zero.
        if self.span:
            diffFromMin = float(self.amount - self.min)
            percentDone = (diffFromMin / float(self.span)) * 100.0
        else:
            percentDone = 100.0
        percentDone = int(round(percentDone))

        # Figure out how many hash bars the percentage should be
        allFull = self.width - 2
        numHashes = (percentDone / 100.0) * allFull
        numHashes = int(round(numHashes))

        # Build a progress bar with an arrow of equal signs; special cases for
        # empty and full
        if numHashes == 0:
            self.progBar = "[>%s]" % (' '*(allFull-1))
        elif numHashes == allFull:
            self.progBar = "[%s]" % ('='*allFull)
        else:
            self.progBar = "[%s>%s]" % ('='*(numHashes-1),
                                        ' '*(allFull-numHashes))

        # figure out where to put the percentage, roughly centered
        # (floor division keeps the slice index an integer)
        percentPlace = (len(self.progBar) // 2) - len(str(percentDone))
        percentString = str(percentDone) + "%"

        # slice the percentage into the bar
        self.progBar = ''.join([self.progBar[0:percentPlace], percentString,
                                self.progBar[percentPlace+len(percentString):]
                                ])

    def __str__(self):
        return str(self.progBar)

    def __call__(self, value):
        """ Updates the amount, and writes to stdout if the bar changed.
            The bar is followed by a carriage return, so the next write
            overwrites the current line in stdout."""

        self.updateAmount(value)
        self.pbar_str = str(self)
        # Only redraw when the rendered text actually differs.
        if self.pbar_str != self._old_pbar:
            self._old_pbar = self.pbar_str
            sys.stdout.write(self.pbar_str + '\r')
            sys.stdout.flush()

About this entry