#!/usr/bin/env python
#
# Copyright (C) 2010 Philipp Winter
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# ----------------------------------------------------------------------
# This script determines and plots the information density (a.k.a. Shannon
# entropy) of small, overlapping chunks of a given file. This is done by
# sliding a window over the entire file's content.
#
# You will need the pprocess and matplotlib modules (optparse is part of the
# Python standard library).
# Contact: pwr@7c0.org

import sys, math, optparse, pprocess

# When no window size is given on the command line (the default), the window
# size is derived by dividing the file length by the value below. Because the
# window advances by half its size, roughly twice as many points are plotted.
MEASUREMENT_POINTS = 100


def H( chunk ):
	"""Return the normalized Shannon entropy of `chunk`.

	`chunk` is a sequence of byte-sized symbols: a string of single-byte
	characters, or a bytes/bytearray object. The result lies between 0.0
	(empty chunk, or all symbols identical) and 1.0 (all 256 byte values
	equally frequent).
	"""

	# local import: Counter is only needed here (available since Python 2.7)
	from collections import Counter

	if not chunk:
		return 0.0

	total = float(len(chunk))

	# Tally every symbol in a single pass. Unlike chunk.count(chr(n)) this
	# works no matter whether iterating the chunk yields characters or ints.
	entropy = 0.0
	for occurrences in Counter(chunk).values():
		p = occurrences / total
		# p <= 1, so every term is <= 0; subtracting keeps the sum
		# non-negative and avoids returning -0.0 for uniform chunks
		entropy -= p * math.log(p, 2)

	# divide by 8 bits per symbol to normalize into the range 0..1
	return entropy / 8


def analyzeData( entry ):
	"""Plot the information density of one file.

	`entry` is a (data, fileName) tuple: the file content and the name that
	is shown in the plot title. A window is moved over `data` in steps of
	half its size; the entropy H() of every window is plotted against the
	offset of the window's centre.

	The window size is taken from the global `options.winSize` when that is
	positive; otherwise it is derived from the data length and
	MEASUREMENT_POINTS.
	"""

	# imported here rather than at module level, as in the original script
	import matplotlib.pyplot as pyplot

	(data, fileName) = entry

	# A positive --window-size wins. Anything else means "automatic"; the
	# automatic size is clamped to 1 so that files shorter than
	# MEASUREMENT_POINTS bytes do not end up with an empty window.
	if options.winSize > 0:
		winSize = options.winSize
	else:
		winSize = max(1, len(data) // MEASUREMENT_POINTS)

	# Consecutive windows overlap by half. The step must never be 0
	# (window size 1), because range() rejects a zero step.
	halfWindow = winSize // 2
	step = max(1, halfWindow)

	entropy = []
	offset = []
	for start in range(0, len(data), step):
		# windows near the end of the data are simply shorter
		entropy.append(H(data[start:start + winSize]))
		offset.append(start + halfWindow)

	pyplot.plot(offset, entropy)
	pyplot.grid(True)
	pyplot.xlabel("file offset (bytes)")
	pyplot.ylabel("information density")
	pyplot.title(fileName + " (window size = %s)" % winSize)
	pyplot.show()


if __name__ == '__main__':

	# command line: an optional manual window size plus one or more files
	cmdParser = optparse.OptionParser("%s [options] file(s)" % sys.argv[0])
	cmdParser.add_option("-w", "--window-size",
		dest="winSize",
		default=0,
		type="int",
		help="specify window size manually")
	# `options` stays a module-level name because analyzeData() reads it
	(options, args) = cmdParser.parse_args()

	if not args:
		cmdParser.error("incorrect number of arguments specified")

	# Read every file completely before the analysis starts. Binary mode
	# keeps the bytes untouched (no newline translation), and the with-block
	# closes each handle as soon as its content has been read.
	dataChunks = []
	for fileName in args:
		with open(fileName, "rb") as handle:
			dataChunks.append((handle.read(), fileName))

	# analyze each (data, fileName) pair via pprocess' parallel map
	pprocess.pmap(analyzeData, dataChunks)

