finding entropy in binary files

Update: added routine to print out hex data for blocks where entropy passes a given threshold.
Update: GUI display of graph (using TK) was not working on my system. Code now saves an image of the results in addition to attempting to display with GUI.
Update: These days I use a slightly modified libdisorder together with gnuplot: tropy /bin/ls | gnuplot -e 'plot "-" using 2:4;'. See the submitted patch for details. Regardless, perhaps someone will still find the code from this article useful.
Update: See github for code and target examples
Update: You can also just use binwalk --entropy

Ero Carrera responded yesterday to a request on OpenRCE concerning the use of entropy analysis to find RSA keys and other random blocks of data in binaries. Herein is a full wrapper for the code he gives. We use matplotlib instead of Mathematica to generate the graph. Also, if you plan to scan files larger than 100k, I’d highly recommend downloading the modified progressBar class included here.

example output:

Target data:

data = ''.join (
  [chr (random.randint (0, 64)) for x in xrange (1024)] +
  [chr (random.randint (0, 255)) for x in xrange (1024)] +
  [chr (random.randint (0, 64)) for x in xrange (1024)] )

example.png

[==================================100%=====================================]
     949 7.00: 1a060113050c2d0d 17302e091d2d0117 →♠☺‼♣♀- ↨0. ↔-☺↨

entropy_graph.py

""" Entropy scan
    H() and entropy_scan() originally by Ero Carrera (blog.dkbza.org)

    Modified May 2007 by cyphunk (deadhacker.com)
    Modified Dec 2009 by cyphunk

    USAGE:
    cmd [target_path]
    """

# FLAGS:
SHOWPROGRESS = 1       # Non-zero: entropy_scan() draws a console
                       # progress bar while it scans.
PRINTONTHRESHOLD = 6.8 # Entropy threshold.  For every block whose
                       # entropy is greater than this value the
                       # block offset, its entropy and its first
                       # 16 bytes (as hex) are printed.  Set to 0
                       # to turn this report off.
ONLYFIRSTBLOCK = 0     # Set to 1 to print only the first block of
                       # each run of consecutive offsets that are
                       # over the threshold.  Set to 0 to print
                       # every block window that is over the
                       # threshold.
BLOCKSIZE = 256        # Size of the blocks scanned; passed to
                       # entropy_scan() as block_size.

import math
import random
from pylab import *
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
import tkFileDialog
from Tkinter import *
from progressBar import *
from binascii import hexlify
import string
import os
import cPickle # cache results

def H(data):
  """Return the Shannon entropy of `data`, in bits per symbol.

  Only the 256 one-character values chr(0) .. chr(255) are counted, so
  the result lies between 0 and 8.  Empty (or otherwise false) input
  yields 0.
  """
  if not data:
    return 0
  size = len(data)
  bits = 0
  for code in range(256):
    occurrences = data.count(chr(code))
    if occurrences == 0:
      continue  # symbols that never occur add nothing to the sum
    freq = float(occurrences) / size
    bits -= freq * math.log(freq, 2)
  return bits

def entropy_scan (data, block_size) :
  """Yield the entropy of every `block_size`-long window of `data`.

  The window slides one position at a time, so the n-th value produced is
  H() of data[n : n + block_size].  Every full window is covered, including
  the last one that ends exactly at the end of `data`.  Nothing is yielded
  when `data` is shorter than `block_size`.

  When the module flag SHOWPROGRESS is set, a console progress bar is
  updated as the scan advances.
  """
  last_offset = len(data) - block_size
  if last_offset < 0:
    return  # not even one full block available
  progress = None
  # A single-window scan would give the bar a zero-length span, so the bar
  # is only created when there is more than one offset to report on.
  if SHOWPROGRESS and last_offset > 0:
    progress = progressBar(0, last_offset, 77)
  # '+ 1' so that the final full window (at last_offset) is scanned too.
  for offset in range(last_offset + 1):
    if progress is not None:
      progress(offset)
    yield H(data[offset : offset + block_size])

# performance improvement if you have psyco
try:
  import psyco
  psyco.full()
  print "got psyco"
except ImportError:
  pass

# Target file: the first command-line argument when one is given,
# otherwise whatever the user picks in a Tk file-open dialog.
filename = ""
if len(sys.argv) < 2:
    root = Tk()
    root.withdraw()
    filename = tkFileDialog.askopenfilename(
        filetypes=[("All files", "*")],
        title="Target binary")
else:
    filename = sys.argv[1]

# run, print graph:

if filename:
    # Open file and scan for entropy:
    if os.path.splitext(filename)[1] == ".entropy":
        print "File is a cached '.entropy' from previous scan"
        results = cPickle.load(open(filename, 'rb'))
        filename = os.path.splitext(filename)[0]
        print filenamea
        raw = open(filename, 'rb').read()
    else:
        raw = open(filename, 'rb').read()
        # debug with test data:
        """
        import random
        raw = ''.join (
        [chr (random.randint (0, 64)) for x in xrange (1024)] +
        [chr (random.randint (0, 255)) for x in xrange (1024)] +
        [chr (random.randint (0, 64)) for x in xrange (1024)] )
        """
        results = list( entropy_scan(raw,BLOCKSIZE) )
        print "saving cache of entropy scan data to %s" % filename+".entropy"
        cPickle.dump(results, open(filename+".entropy", 'wb')) 

    # Print blocks that are above a defined threshold of entropy:
    if PRINTONTHRESHOLD > 0:
        print
        found = 0
        for i in range(len(results)):
            if results[i] > PRINTONTHRESHOLD:
                if found == 0:
                    table = string.maketrans("rnt", '   ') # don't like newlines
                    #blockstr = string.translate(str(raw[i : i+16]), table) # translate to string value
                    print "0x%8x %.2f: %s %s" % (i, results[i], hexlify(raw[i : i+8]),
                                                     hexlify(raw[i+8 : i+16]))
                    #%.3f - %016X / %s" % (i, results[i], raw[i : i + 16], raw[i : i + 16])
                    found = ONLYFIRSTBLOCK
            else:
                found = 0

    # Plot
    filesize = os.path.getsize(filename)
    imgdpi = 100
    imgwidth = filesize / imgdpi

    if imgwidth > 327:
      imgwidth = 327

    majorLocator   = MultipleLocator(0x400)   # mark every 1024 bytes
    majorFormatter = FormatStrFormatter('%X') # change to %d to see decimal offsets

    ax = subplot(111)
    plot(results, linewidth=2.0, antialiased=False)
    subplots_adjust(left=0.02, right=0.99, bottom=0.2)

    ax.axis([0,filesize,0,8])
    ax.xaxis.set_major_locator(majorLocator)
    ax.xaxis.set_major_formatter(majorFormatter)
    xticks(rotation=315)

    xlabel('block (byte offset)')
    ylabel('entropy')
    title('Entropy levels')

    grid(True)

    img = gcf()
    img.set_size_inches(imgwidth, 6)
    img.savefig(filename+".png", dpi=imgdpi)

    draw()
    show()

progressBar.py (originally from ActiveState, but modified for our use)

import sys

class progressBar:
    """ Creates a text-based progress bar. Call the object with the `print'
        command to see the progress bar, which looks something like this:

        [=======>        22%                  ]

        You may specify the progress bar's width, min and max values on init.
        Calling the object with a value updates the bar and rewrites it on
        stdout, but only when its text actually changed.
    """
    def __init__(self, minValue = 0, maxValue = 100, totalWidth=80):
        """ minValue / maxValue give the range of amounts that map to
            0% .. 100%; totalWidth is the width of the bar in characters,
            including the two brackets. """
        self.progBar = "[]"   # This holds the progress bar string
        self.min = minValue
        self.max = maxValue
        self.span = maxValue - minValue
        self.width = totalWidth
        self.amount = 0       # When amount == max, we are 100% done
        self.updateAmount(0)  # Build progress bar string
        self._old_pbar = ""   # last string written, used to track change
        self.pbar_str = ""

    def updateAmount(self, newAmount = 0):
        """ Update the progress bar with the new amount (with min and max
            values set at initialization; if it is over or under, it takes the
            min or max value as a default. """
        # Clamp low first, then high, so that a bar whose max is below its
        # min settles on max (i.e. reads as finished).
        if newAmount < self.min: newAmount = self.min
        if newAmount > self.max: newAmount = self.max
        self.amount = newAmount

        # Figure out the new percent done, round to an integer
        if self.span == 0:
            # min == max: there is nothing to wait for, and dividing by the
            # span would fail.
            percentDone = 100
        else:
            diffFromMin = float(self.amount - self.min)
            percentDone = (diffFromMin / float(self.span)) * 100.0
            percentDone = int(round(percentDone))

        # Figure out how many hash bars the percentage should be
        allFull = self.width - 2
        numHashes = (percentDone / 100.0) * allFull
        numHashes = int(round(numHashes))

        # Build a progress bar with an arrow of equal signs; special cases for
        # empty and full
        if numHashes == 0:
            self.progBar = "[>%s]" % (' '*(allFull-1))
        elif numHashes == allFull:
            # the finished bar carries a newline so later output starts on
            # a fresh line
            self.progBar = "[%s]\n" % ('='*allFull)
        else:
            self.progBar = "[%s>%s]" % ('='*(numHashes-1),
                                        ' '*(allFull-numHashes))

        # figure out where to put the percentage, roughly centered
        # (floor division: the result is used as a slice index below)
        percentPlace = (len(self.progBar) // 2) - len(str(percentDone))
        percentString = str(percentDone) + "%"

        # slice the percentage into the bar
        self.progBar = ''.join([self.progBar[0:percentPlace], percentString,
                                self.progBar[percentPlace+len(percentString):]
                                ])

    def __str__(self):
        """ Return the current progress bar text. """
        return str(self.progBar)

    def __call__(self, value):
        """ Updates the amount, and writes to stdout. Writes a carriage
            return after the bar, so the next write overwrites the current
            line in stdout. Nothing is written when the bar text is
            unchanged since the previous call. """

        self.updateAmount(value)
        self.pbar_str = str(self)
        if self.pbar_str != self._old_pbar:
            self._old_pbar = self.pbar_str
            sys.stdout.write(self.pbar_str + "\r")
            sys.stdout.flush()
Advertisements

About this entry