port some benchmarks

2020-08-11 17:17:16 +00:00 · 2020-08-11 17:17:16 +00:00 · 009f8b5fb0
commit 009f8b5fb0
parent 30a534e650
30 changed files with 5913 additions and 0 deletions
--- a/coremark/Makefile
+++ b/coremark/Makefile
@ -0,0 +1,3 @@
+# Name of this benchmark; presumably consumed by the rules in $(AM_HOME)/Makefile.
+NAME = coremark
+# Every .c file found under ./src/ (find -L follows symbolic links) is a source file.
+SRCS = $(shell find -L ./src/ -name "*.c")
+# The build rules themselves come from the Makefile located at $(AM_HOME).
+include $(AM_HOME)/Makefile
--- a/coremark/include/core_portme.h
+++ b/coremark/include/core_portme.h
@ -0,0 +1,188 @@
+/* Topic : Description
+	This file contains configuration constants required to execute on different platforms
+*/
+
+
+#ifndef CORE_PORTME_H
+#define CORE_PORTME_H
+
+#include <am.h>
+#include <klib.h>
+#include <klib-macros.h>
+
+/* Port-level settings.
+	ITERATIONS is the count printed by main's "Running CoreMark for %d iterations" banner.
+	MEM_METHOD selects the static data block; MEM_STATIC is defined in coremark.h
+	before this header is included.
+*/
+#define ITERATIONS 1000
+#define MEM_METHOD MEM_STATIC
+
+/************************/
+/* Data types and settings */
+/************************/
+/* Configuration : HAS_FLOAT
+	Set to 1 when the platform supports floating point.
+	coremark.h uses it to choose between double and ee_u32 for secs_ret.
+*/
+#if !defined(HAS_FLOAT)
+#define HAS_FLOAT 0
+#endif
+/* Configuration : HAS_TIME_H
+	Set to 1 when the platform provides time.h together with
+	working implementations of its functions.
+*/
+#if !defined(HAS_TIME_H)
+#define HAS_TIME_H 0
+#endif
+/* Configuration : USE_CLOCK
+	Timing-related switch, disabled by default in this port.
+	NOTE(review): the previous description was a copy of HAS_TIME_H's; the code
+	that reads this flag is not part of this header - confirm its exact meaning.
+*/
+#if !defined(USE_CLOCK)
+#define USE_CLOCK 0
+#endif
+/* Configuration : HAS_STDIO
+	Set to 1 when the platform has stdio.h; coremark.h then includes it.
+*/
+#if !defined(HAS_STDIO)
+#define HAS_STDIO 0
+#endif
+/* Configuration : HAS_PRINTF
+	Set to 1 when a printf function is available; coremark.h then maps
+	ee_printf onto printf.
+*/
+#if !defined(HAS_PRINTF)
+#define HAS_PRINTF 1
+#endif
+
+/* Configuration : CORE_TICKS
+	Define type of return from the timing functions.
+	In this port it is a 32-bit unsigned value: get_time() returns it and
+	time_in_secs() takes it (see the prototypes in coremark.h).
+ */
+typedef uint32_t CORE_TICKS;
+
+/* Definitions : COMPILER_VERSION, COMPILER_FLAGS, MEM_LOCATION
+	Strings describing the build; initialize these per platform.
+	Each of them can be overridden by defining it before this header is read.
+*/
+#ifndef COMPILER_VERSION
+ #ifdef __GNUC__
+ #define COMPILER_VERSION "GCC"__VERSION__
+ #else
+ #define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)"
+ #endif
+#endif
+/* NOTE(review): COMPILER_FLAGS expands to nothing, not to a string literal.
+	The code that uses it is not visible from here - confirm an empty expansion
+	is acceptable at every use site.
+*/
+#ifndef COMPILER_FLAGS
+ #define COMPILER_FLAGS
+#endif
+/* This header forces MEM_METHOD to MEM_STATIC, so the benchmark data lives in
+	a static array rather than on the stack; the description must say so.
+*/
+#ifndef MEM_LOCATION
+ #define MEM_LOCATION "STATIC"
+#endif
+
+/* Data Types :
+	To avoid compiler issues, define the data types that need to be used for 8b, 16b and 32b in <core_portme.h>.
+
+	*Important* :
+	ee_ptr_int needs to be the data type used to hold pointers, otherwise coremark may fail!!!
+*/
+typedef signed short ee_s16;
+typedef unsigned short ee_u16;
+typedef signed int ee_s32;
+typedef double ee_f32;
+typedef unsigned char ee_u8;
+typedef unsigned int ee_u32;
+typedef unsigned long ee_ptr_int;
+typedef size_t ee_size_t;
+/* align_mem :
+	This macro is used to align an offset to point to a 32b value. It is used in the Matrix algorithm to initialize the input memory blocks.
+
+	The argument is rounded up to the next multiple of 4 (values that are
+	already a multiple of 4 are returned unchanged).
+	The arithmetic goes through ee_ptr_int so that a port which changes the
+	pointer-sized integer type only has to touch the typedef above, and the
+	whole expansion is parenthesized so it is safe inside larger expressions.
+*/
+#define align_mem(x) ((void *)(4 + (((ee_ptr_int)(x) - 1) & ~(ee_ptr_int)3)))
+
+/* Configuration : SEED_METHOD
+	Selects how seed values that must not be computable at compile time are obtained.
+
+	Valid values :
+	SEED_ARG - from command line.
+	SEED_FUNC - from a system function.
+	SEED_VOLATILE - from volatile variables.
+
+	Defaults to SEED_VOLATILE when nothing was chosen earlier.
+*/
+#if !defined(SEED_METHOD)
+#define SEED_METHOD SEED_VOLATILE
+#endif
+
+/* Configuration : MEM_METHOD
+	Selects how the benchmark obtains its block of memory.
+
+	Valid values :
+	MEM_MALLOC - for platforms that implement malloc and have malloc.h.
+	MEM_STATIC - to use a static memory array.
+	MEM_STACK - to allocate the data block on the stack (NYI).
+
+	Falls back to MEM_STACK only when MEM_METHOD is still undefined here.
+*/
+#if !defined(MEM_METHOD)
+#define MEM_METHOD MEM_STACK
+#endif
+
+/* Configuration : MULTITHREAD
+	Define for parallel execution
+
+	Valid values :
+	1 - only one context (default).
+	N>1 - will execute N copies in parallel.
+
+	Note :
+	If this flag is defined to more than 1, an implementation for launching parallel contexts must be defined.
+
+	Two sample implementations are provided. Use <USE_PTHREAD> or <USE_FORK> to enable them.
+
+	It is valid to have a different implementation of <core_start_parallel> and <core_end_parallel> in <core_portme.c>,
+	to fit a particular architecture.
+
+	The USE_* switches below receive their 0 defaults only when MULTITHREAD
+	itself was not defined before this point.
+*/
+#ifndef MULTITHREAD
+#define MULTITHREAD 1
+#define USE_PTHREAD 0
+#define USE_FORK 0
+#define USE_SOCKET 0
+#endif
+
+/* Configuration : MAIN_HAS_NOARGC
+	Set when the platform cannot pass arguments to main.
+
+	Valid values :
+	0 - argc/argv to main is supported
+	1 - argc/argv to main is not supported
+
+	Note :
+	This flag only matters if MULTITHREAD has been defined to a value greater than 1.
+*/
+#if !defined(MAIN_HAS_NOARGC)
+#define MAIN_HAS_NOARGC 0
+#endif
+
+/* Configuration : MAIN_HAS_NORETURN
+	Set when the platform cannot return a value from main.
+
+	Valid values :
+	0 - main returns an int, and return value will be 0.
+	1 - platform does not support returning a value from main
+
+	coremark.h derives MAIN_RETURN_TYPE and MAIN_RETURN_VAL from this flag.
+*/
+#if !defined(MAIN_HAS_NORETURN)
+#define MAIN_HAS_NORETURN 0
+#endif
+
+/* Variable : default_num_contexts
+	Not used for this simple port, must contain the value 1.
+*/
+extern ee_u32 default_num_contexts;
+
+/* Port-specific state; core_results (coremark.h) embeds one as its `port` member.
+	This port only keeps a single byte id in it.
+*/
+typedef struct CORE_PORTABLE_S {
+	ee_u8	portable_id;
+} core_portable;
+
+/* target specific init/fini; main calls portable_init before anything else */
+void portable_init(core_portable *p, int *argc, char *argv[]);
+void portable_fini(core_portable *p);
+
+/* Pick the run type from TOTAL_DATA_SIZE unless one of PROFILE_RUN,
+	PERFORMANCE_RUN or VALIDATION_RUN was already chosen:
+	1200 -> profile run, 2000 -> performance run, anything else -> validation run.
+*/
+#if defined(PROFILE_RUN) || defined(PERFORMANCE_RUN) || defined(VALIDATION_RUN)
+/* run type already selected - nothing to do */
+#elif (TOTAL_DATA_SIZE==1200)
+#define PROFILE_RUN 1
+#elif (TOTAL_DATA_SIZE==2000)
+#define PERFORMANCE_RUN 1
+#else
+#define VALIDATION_RUN 1
+#endif
+
+
+#endif /* CORE_PORTME_H */
--- a/coremark/include/coremark.h
+++ b/coremark/include/coremark.h
@ -0,0 +1,174 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+/* Topic: Description
+	This file contains  declarations of the various benchmark functions.
+*/
+
+/* Configuration: TOTAL_DATA_SIZE
+	Define total size (in bytes) for data algorithms will operate on.
+	The default is parenthesized so the macro behaves as a single value in any
+	expression (e.g. x / TOTAL_DATA_SIZE or x % TOTAL_DATA_SIZE).
+*/
+#ifndef TOTAL_DATA_SIZE
+#define TOTAL_DATA_SIZE (2*1000)
+#endif
+
+/* Values accepted by SEED_METHOD (see core_portme.h). */
+#define SEED_ARG 0
+#define SEED_FUNC 1
+#define SEED_VOLATILE 2
+
+/* Values accepted by MEM_METHOD (see core_portme.h). */
+#define MEM_STATIC 0
+#define MEM_MALLOC 1
+#define MEM_STACK 2
+
+/* Included after the constants above because core_portme.h refers to them. */
+#include "core_portme.h"
+
+#if HAS_STDIO
+#include <stdio.h>
+#endif
+/* With HAS_PRINTF set, ee_printf is simply an alias for printf. */
+#if HAS_PRINTF
+#define ee_printf printf
+#endif
+
+/* Actual benchmark execution in iterate.
+	pres is a pointer to a core_results; the return value is always NULL. */
+void *iterate(void *pres);
+
+/* Typedef: secs_ret
+	For machines that have floating point support, get number of seconds as a double.
+	Otherwise an unsigned int.
+*/
+#if HAS_FLOAT
+typedef double secs_ret;
+#else
+typedef ee_u32 secs_ret;
+#endif
+
+/* Return type and value of main, selected by MAIN_HAS_NORETURN. */
+#if MAIN_HAS_NORETURN
+#define MAIN_RETURN_VAL
+#define MAIN_RETURN_TYPE void
+#else
+#define MAIN_RETURN_VAL 0
+#define MAIN_RETURN_TYPE int
+#endif
+
+/* Timing interface; main brackets the benchmark with start_time()/stop_time()
+	and then reads the elapsed ticks with get_time(). */
+void start_time(void);
+void stop_time(void);
+CORE_TICKS get_time(void);
+secs_ret time_in_secs(CORE_TICKS ticks);
+
+/* Misc useful functions */
+/* CRC helpers: each folds a new value into the running 16-bit crc and returns the result. */
+ee_u16 crcu8(ee_u8 data, ee_u16 crc);
+ee_u16 crc16(ee_s16 newval, ee_u16 crc);
+ee_u16 crcu16(ee_u16 newval, ee_u16 crc);
+ee_u16 crcu32(ee_u32 newval, ee_u16 crc);
+ee_u8 check_data_types();
+void *portable_malloc(ee_size_t size);
+void portable_free(void *p);
+ee_s32 parseval(char *valstring);
+
+/* Algorithm IDS
+	One bit per algorithm; the bits are OR-ed together in core_results.execs.
+	NUM_ALGORITHMS must match the number of ID_* bits.
+*/
+#define ID_LIST 	(1<<0)
+#define ID_MATRIX 	(1<<1)
+#define ID_STATE 	(1<<2)
+#define ALL_ALGORITHMS_MASK (ID_LIST|ID_MATRIX|ID_STATE)
+#define NUM_ALGORITHMS 3
+
+/* list data structures */
+/* Payload of one list cell: a 16-bit data word and the cell's index. */
+typedef struct list_data_s {
+	ee_s16 data16;
+	ee_s16 idx;
+} list_data;
+
+/* Singly linked list node; the payload is stored separately and reached through info. */
+typedef struct list_head_s {
+	struct list_head_s *next;
+	struct list_data_s *info;
+} list_head;
+
+
+/*matrix benchmark related stuff */
+/* MATDAT is the matrix element type, MATRES the result type.
+	NOTE(review): the non-integer branch uses ee_f16, which core_portme.h does
+	not define - it would not compile with MATDAT_INT set to 0 as things stand.
+*/
+#define MATDAT_INT 1
+#if MATDAT_INT
+typedef ee_s16 MATDAT;
+typedef ee_s32 MATRES;
+#else
+typedef ee_f16 MATDAT;
+typedef ee_f32 MATRES;
+#endif
+
+/* Matrix benchmark parameters: a dimension N, two MATDAT inputs (A, B) and a MATRES output (C). */
+typedef struct MAT_PARAMS_S {
+	int N;
+	MATDAT *A;
+	MATDAT *B;
+	MATRES *C;
+} mat_params;
+
+/* state machine related stuff */
+/* List of all the possible states for the FSM.
+	NUM_CORE_STATES is not a state itself; being last, it equals the number of states. */
+typedef enum CORE_STATE {
+	CORE_START=0,
+	CORE_INVALID,
+	CORE_S1,
+	CORE_S2,
+	CORE_INT,
+	CORE_FLOAT,
+	CORE_EXPONENT,
+	CORE_SCIENTIFIC,
+	NUM_CORE_STATES
+} core_state_e ;
+
+
+/* Helper structure to hold results.
+	One instance per execution context; main fills in the inputs, iterate() and
+	the benchmark functions update the outputs.
+*/
+typedef struct RESULTS_S {
+	/* inputs */
+	ee_s16	seed1;		/* Initializing seed */
+	ee_s16	seed2;		/* Initializing seed */
+	ee_s16	seed3;		/* Initializing seed */
+	void	*memblock[4];	/* Pointer to safe memory location; [0] is the whole block, [1..3] the per-algorithm slices */
+	ee_u32	size;		/* Size of the data */
+	ee_u32 iterations;		/* Number of iterations to execute */
+	ee_u32	execs;		/* Bitmask of operations to execute */
+	struct list_head_s *list;
+	mat_params mat;
+	/* outputs */
+	ee_u16	crc;
+	ee_u16	crclist;
+	ee_u16	crcmatrix;
+	ee_u16	crcstate;
+	ee_s16	err;
+	/* multithread specific */
+	core_portable port;
+} core_results;
+
+/* Multicore execution handling; only declared when more than one context is configured */
+#if (MULTITHREAD>1)
+ee_u8 core_start_parallel(core_results *res);
+ee_u8 core_stop_parallel(core_results *res);
+#endif
+
+/* list benchmark functions */
+list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed);
+ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx);
+
+/* state benchmark functions */
+void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p);
+ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock,
+		ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc);
+
+/* matrix benchmark functions */
+ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p);
+ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc);
+
--- a/coremark/src/core_list_join.c
+++ b/coremark/src/core_list_join.c
@ -0,0 +1,496 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+
+#include "coremark.h"
+/*
+Topic: Description
+	Benchmark using a linked list.
+
+	Linked list is a common data structure used in many applications.
+
+	For our purposes, this will exercise the memory units of the processor.
+	In particular, usage of the list pointers to find and alter data.
+
+	We are not using Malloc since some platforms do not support this library.
+
+	Instead, the memory block being passed in is used to create a list,
+	and the benchmark takes care not to add more items than can be
+	accommodated by the memory block. The porting layer will make sure
+	that we have a valid memory block.
+
+	All operations are done in place, without using any extra memory.
+
+	The list itself contains list pointers and pointers to data items.
+	Data items contain the following:
+
+	idx - An index that captures the initial order of the list.
+	data - Variable data initialized based on the input parameters. The 16b are divided as follows:
+	o Upper 8b are backup of original data.
+	o Bit 7 indicates if the lower 7 bits are to be used as is or calculated.
+	o Bits 0-2 indicate type of operation to perform to get a 7b value.
+	o Bits 3-6 provide input for the operation.
+
+*/
+
+/* local functions */
+
+list_head *core_list_find(list_head *list,list_data *info);
+list_head *core_list_reverse(list_head *list);
+list_head *core_list_remove(list_head *item);
+list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modified);
+list_head *core_list_insert_new(list_head *insert_point
+	, list_data *info, list_head **memblock, list_data **datablock
+	, list_head *memblock_end, list_data *datablock_end);
+/* Comparison callback used by the sort: <0, 0 or >0 depending on how a orders against b.
+	res is passed through unchanged from core_list_mergesort and may be NULL (see cmp_idx). */
+typedef ee_s32(*list_cmp)(list_data *a, list_data *b, core_results *res);
+list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res);
+
+/* Function: calc_func
+	Turn the 16b data word of a list item into a 7b value, caching the result.
+
+	Parameters:
+	pdata - data word of the item; rewritten with the cached result.
+	res - benchmark context (inputs for the state/matrix kernels, running crc).
+
+	Returns:
+	The 7b value for this item.
+*/
+ee_s16 calc_func(ee_s16 *pdata, core_results *res) {
+	const ee_s16 word=*pdata;
+	ee_s16 value;
+	ee_s16 op, arg;
+
+	/* bit 7 set: the low 7 bits already hold a cached result */
+	if ((word>>7) & 1)
+		return (word & 0x007f);
+
+	op=word & 0x7;          /* bits 0-2 select the function to run */
+	arg=((word>>3) & 0xf);  /* bits 3-6 are the input for that function */
+	arg |= arg << 4;        /* replicate the 4 bits to form an 8b value */
+
+	if (op==0) {
+		/* state machine benchmark; enforce a minimum period for bit corruption */
+		if (arg<0x22)
+			arg=0x22;
+		value=core_bench_state(res->size,res->memblock[3],res->seed1,res->seed2,arg,res->crc);
+		if (res->crcstate==0)
+			res->crcstate=value;
+	} else if (op==1) {
+		/* matrix benchmark */
+		value=core_bench_matrix(&(res->mat),arg,res->crc);
+		if (res->crcmatrix==0)
+			res->crcmatrix=value;
+	} else {
+		/* any other selector: the data word is used as is */
+		value=word;
+	}
+
+	res->crc=crcu16(value,res->crc);
+	value &= 0x007f;
+	/* keep the backup byte, set the "cached" flag, store the 7b result */
+	*pdata = (word & 0xff00) | 0x0080 | value;
+	return value;
+}
+/* Function: cmp_complex
+	Order two list cells by the value calc_func derives from their data words.
+	Evaluating the values may run the state/matrix kernels and updates res.
+
+	Can be used by mergesort.
+*/
+ee_s32 cmp_complex(list_data *a, list_data *b, core_results *res) {
+	ee_s16 lhs, rhs;
+	/* a is always evaluated before b: calc_func has side effects on res */
+	lhs=calc_func(&(a->data16),res);
+	rhs=calc_func(&(b->data16),res);
+	return lhs - rhs;
+}
+
+/* Function: cmp_idx
+	Order two list cells by their idx field. When no context is supplied
+	(res is a null pointer) the low byte of each data word is first restored
+	from the backup kept in its high byte.
+
+	Can be used by mergesort.
+*/
+ee_s32 cmp_idx(list_data *a, list_data *b, core_results *res) {
+	if (!res) {
+		ee_s16 backup;
+		backup = 0x00ff & (a->data16>>8);
+		a->data16 = (a->data16 & 0xff00) | backup;
+		backup = 0x00ff & (b->data16>>8);
+		b->data16 = (b->data16 & 0xff00) | backup;
+	}
+	return a->idx - b->idx;
+}
+
+/* Copy both fields (data16 and idx) of *from into *to. */
+void copy_info(list_data *to,list_data *from) {
+	*to = *from;
+}
+
+/* Benchmark for linked list:
+	- Try to find multiple data items.
+	- List sort
+	- Operate on data from list (crc)
+	- Single remove/reinsert
+	* At the end of this function, the list is back to original state
+
+	Parameters:
+	res - benchmark context; res->list is the list, res->seed3 the number of finds.
+	finder_idx - starting index to search for. A non-negative value searches by
+		idx, a negative one by data value (see core_list_find); the sort by data
+		content is only performed when it is greater than 0.
+
+	Returns:
+	16b value accumulated from the find results and the list CRCs.
+*/
+ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx) {
+	ee_u16 retval=0;
+	ee_u16 found=0,missed=0;
+	list_head *list=res->list;
+	ee_s16 find_num=res->seed3;
+	list_head *this_find;
+	list_head *finder, *remover;
+	list_data info = {0}; /* `{0}` is standard C; the empty `{}` form is only valid from C23 or as a compiler extension */
+	ee_s16 i;
+
+	info.idx=finder_idx;
+	/* find <find_num> values in the list, and change the list each time (reverse and cache if value found) */
+	for (i=0; i<find_num; i++) {
+		info.data16= (i & 0xff) ;
+		this_find=core_list_find(list,&info);
+		list=core_list_reverse(list);
+		if (this_find==NULL) {
+			missed++;
+			retval+=(list->next->info->data16 >> 8) & 1;
+		}
+		else {
+			found++;
+			if (this_find->info->data16 & 0x1) /* use found value */
+				retval+=(this_find->info->data16 >> 9) & 1;
+			/* and cache next item at the head of the list (if any) */
+			if (this_find->next != NULL) {
+				finder = this_find->next;
+				this_find->next = finder->next;
+				finder->next=list->next;
+				list->next=finder;
+			}
+		}
+		if (info.idx>=0)
+			info.idx++;
+#if CORE_DEBUG
+	ee_printf("List find %d: [%d,%d,%d]\n",i,retval,missed,found);
+#endif
+	}
+	retval+=found*4-missed;
+	/* sort the list by data content and remove one item*/
+	if (finder_idx>0)
+		list=core_list_mergesort(list,cmp_complex,res);
+	remover=core_list_remove(list->next);
+	/* CRC data content of list from location of index N forward, and then undo remove */
+	finder=core_list_find(list,&info);
+	if (!finder)
+		finder=list->next;
+	/* NOTE(review): the loop walks finder but feeds list->info (the head cell) into
+	   the CRC on every step. Left untouched on purpose: changing it would change the
+	   value this function returns - presumably the expected CRCs in core_main.c
+	   were produced with this behaviour, so do not "fix" it without regenerating them. */
+	while (finder) {
+		retval=crc16(list->info->data16,retval);
+		finder=finder->next;
+	}
+#if CORE_DEBUG
+	ee_printf("List sort 1: %04x\n",retval);
+#endif
+	remover=core_list_undo_remove(remover,list->next);
+	/* sort the list by index, in effect returning the list to original state */
+	list=core_list_mergesort(list,cmp_idx,NULL);
+	/* CRC data content of list (same head-cell pattern as the loop above) */
+	finder=list->next;
+	while (finder) {
+		retval=crc16(list->info->data16,retval);
+		finder=finder->next;
+	}
+#if CORE_DEBUG
+	ee_printf("List sort 2: %04x\n",retval);
+#endif
+	return retval;
+}
+/* Function: core_list_init
+	Initialize list with data.
+
+	The memory block is split in two regions: list headers first, followed by
+	the data items they point to. A fake head and a fake tail item are created
+	first, then the real items are inserted and the list is sorted by idx.
+
+	Parameters:
+	blksize - Size of memory to be initialized.
+	memblock - Pointer to memory block.
+	seed - 	Actual values chosen depend on the seed parameter.
+		The seed parameter MUST be supplied from a source that cannot be determined at compile time
+
+	Returns:
+	Pointer to the head of the list.
+
+*/
+list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed) {
+	/* calculated pointers for the list */
+	ee_u32 per_item=16+sizeof(struct list_data_s);
+	ee_u32 size=(blksize/per_item)-2; /* to accommodate systems with 64b pointers, and make sure same code is executed, set max list elements */
+	list_head *memblock_end=memblock+size;
+	list_data *datablock=(list_data *)(memblock_end); /* data items start right after the header region */
+	list_data *datablock_end=datablock+size;
+	/* some useful variables */
+	ee_u32 i;
+	list_head *finder,*list=memblock;
+	list_data info;
+
+	/* create a fake items for the list head and tail */
+	list->next=NULL;
+	list->info=datablock;
+	list->info->idx=0x0000;
+	list->info->data16=(ee_s16)0x8080;
+	memblock++;
+	datablock++;
+	info.idx=0x7fff;
+	info.data16=(ee_s16)0xffff;
+	core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end);
+
+	/* then insert size items; core_list_insert_new returns NULL and inserts nothing once either region is full */
+	for (i=0; i<size; i++) {
+		ee_u16 datpat=((ee_u16)(seed^i) & 0xf);
+		ee_u16 dat=(datpat<<3) | (i&0x7); /* alternate between algorithms */
+		info.data16=(dat<<8) | dat;		/* fill the data with actual data and upper bits with rebuild value */
+		core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end);
+	}
+	/* and now index the list so we know initial seed order of the list */
+	finder=list->next;
+	i=1;
+	while (finder->next!=NULL) { /* stops before the last item, so the fake tail keeps idx 0x7fff */
+		if (i<size/5) /* first 20% of the list in order */
+			finder->info->idx=i++;
+		else {
+			ee_u16 pat=(ee_u16)(i++ ^ seed); /* get a pseudo random number */
+			/* note: i has already been incremented when it is used on the next line */
+			finder->info->idx=0x3fff & (((i & 0x07) << 8) | pat); /* make sure the mixed items end up after the ones in sequence */
+		}
+		finder=finder->next;
+	}
+	list = core_list_mergesort(list,cmp_idx,NULL);
+#if CORE_DEBUG
+	ee_printf("Initialized list:\n");
+	finder=list;
+	while (finder) {
+		ee_printf("[%04x,%04x]",finder->info->idx,(ee_u16)finder->info->data16);
+		finder=finder->next;
+	}
+	ee_printf("\n");
+#endif
+	return list;
+}
+
+/* Function: core_list_insert_new
+	Carve a new item out of the header/data regions and link it into the list
+	right after the insertion point.
+
+	Parameters:
+	insert_point - item after which the new item is linked.
+	info - data copied into the new cell.
+	memblock - in/out cursor into the list header region
+	datablock - in/out cursor into the list data region
+	memblock_end - end of region for list headers
+	datablock_end - end of region for list data
+
+	Returns:
+	Pointer to new item, or NULL (nothing inserted, cursors untouched) when
+	either region has no room left.
+*/
+list_head *core_list_insert_new(list_head *insert_point, list_data *info, list_head **memblock, list_data **datablock
+	, list_head *memblock_end, list_data *datablock_end) {
+	list_head *node;
+
+	/* refuse the insert unless both regions can still take one more entry */
+	if (((*memblock+1) >= memblock_end) || ((*datablock+1) >= datablock_end))
+		return NULL;
+
+	/* take the next free header and the next free data cell */
+	node=(*memblock)++;
+	node->info=(*datablock)++;
+
+	/* splice the node in behind insert_point */
+	node->next=insert_point->next;
+	insert_point->next=node;
+
+	copy_info(node->info,info);
+	return node;
+}
+
+/* Function: core_list_remove
+	Remove an item from the list.
+
+	Operation:
+	The list is singly linked, so the item itself cannot be unlinked. Instead
+	its data pointer is exchanged with the one of its successor, and the
+	successor is unlinked.
+
+	Note:
+	There is always a fake item at the end of the list, so the successor is
+	assumed to exist and is not checked for NULL.
+
+	Returns:
+	The unlinked cell (now carrying the removed data), with next set to NULL.
+*/
+list_head *core_list_remove(list_head *item) {
+	list_head *victim=item->next;
+	list_data *kept=item->info;
+
+	/* exchange the data pointers of item and its successor */
+	item->info=victim->info;
+	victim->info=kept;
+
+	/* unlink the successor */
+	item->next=victim->next;
+	victim->next=NULL;
+	return victim;
+}
+
+/* Function: core_list_undo_remove
+	Revert a previous <core_list_remove>.
+
+	Operation:
+	Every benchmark iteration must see the same list, so a remove has to be
+	reversible. The data pointers are exchanged back and the removed cell is
+	linked in again directly behind the modified one.
+
+	Parameters:
+	item_removed - Return value from the <core_list_remove>
+	item_modified - List item that was modified during <core_list_remove>
+
+	Returns:
+	The item that was linked back to the list.
+
+*/
+list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modified) {
+	list_data *held=item_modified->info;
+
+	/* exchange the data pointers back */
+	item_modified->info=item_removed->info;
+	item_removed->info=held;
+
+	/* re-link the removed cell right after the modified one */
+	item_removed->next=item_modified->next;
+	item_modified->next=item_removed;
+	return item_removed;
+}
+
+/* Function: core_list_find
+	Find an item in the list
+
+	Operation:
+	When info->idx is non-negative the search is by idx; otherwise the low
+	byte of each item's data word is compared against info->data16.
+
+	Parameters:
+	list - list head
+	info - idx or data to find
+
+	Returns:
+	Found item, or NULL if not found.
+*/
+list_head *core_list_find(list_head *list,list_data *info) {
+	list_head *cur;
+
+	if (info->idx>=0) {
+		for (cur=list; cur; cur=cur->next) {
+			if (cur->info->idx == info->idx)
+				break;
+		}
+	} else {
+		for (cur=list; cur; cur=cur->next) {
+			if ((cur->info->data16 & 0xff) == info->data16)
+				break;
+		}
+	}
+	/* cur is the match, or a null pointer when the end of the list was reached */
+	return cur;
+}
+/* Function: core_list_reverse
+	Reverse a list
+
+	Operation:
+	Walk the list once, pointing every item at its predecessor.
+
+	Parameters:
+	list - first item of the list to reverse
+
+	Returns:
+	First item of the reversed list (the former last item), or NULL for an
+	empty list.
+*/
+
+list_head *core_list_reverse(list_head *list) {
+	list_head *reversed=NULL;
+	list_head *cur, *rest;
+
+	for (cur=list; cur; cur=rest) {
+		rest=cur->next;       /* remember the remainder before overwriting the link */
+		cur->next=reversed;   /* hook the item in front of what is already reversed */
+		reversed=cur;
+	}
+	return reversed;
+}
+/* Function: core_list_mergesort
+	Sort the list in place without recursion.
+
+	Description:
+	Use mergesort, as for linked list this is a realistic solution.
+	Also, since this is aimed at embedded, care was taken to use iterative rather than recursive algorithm.
+	The sort can either return the list to original order (by idx) ,
+	or use the data item to invoke other algorithms and change the order of the list.
+
+	Parameters:
+	list - list to be sorted (may be NULL, in which case NULL is returned).
+	cmp - cmp function to use
+	res - context handed unchanged to every cmp call (may be NULL).
+
+	Returns:
+	New head of the list.
+
+	Note:
+	We have a special header for the list that will always be first,
+	but the algorithm could theoretically modify where the list starts.
+
+ */
+list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res) {
+	list_head *p, *q, *e, *tail;
+	ee_s32 insize, nmerges, psize, qsize, i;
+
+	/* An empty list is already sorted. Without this guard the first pass would
+	   leave tail unset (NULL) and the termination below would dereference it. */
+	if (list == NULL)
+		return NULL;
+
+	insize = 1;
+
+	while (1) {
+		p = list;
+		list = NULL;
+		tail = NULL;
+
+		nmerges = 0;  /* count number of merges we do in this pass */
+
+		while (p) {
+			nmerges++;  /* there exists a merge to be done */
+			/* step `insize' places along from p */
+			q = p;
+			psize = 0;
+			for (i = 0; i < insize; i++) {
+				psize++;
+				q = q->next;
+				if (!q) break;
+			}
+
+			/* if q hasn't fallen off end, we have two lists to merge */
+			qsize = insize;
+
+			/* now we have two lists; merge them */
+			while (psize > 0 || (qsize > 0 && q)) {
+
+				/* decide whether next element of merge comes from p or q */
+				if (psize == 0) {
+					/* p is empty; e must come from q. */
+					e = q; q = q->next; qsize--;
+				} else if (qsize == 0 || !q) {
+					/* q is empty; e must come from p. */
+					e = p; p = p->next; psize--;
+				} else if (cmp(p->info,q->info,res) <= 0) {
+					/* First element of p is lower (or same); e must come from p. */
+					e = p; p = p->next; psize--;
+				} else {
+					/* First element of q is lower; e must come from q. */
+					e = q; q = q->next; qsize--;
+				}
+
+				/* add the next element to the merged list */
+				if (tail) {
+					tail->next = e;
+				} else {
+					list = e;
+				}
+				tail = e;
+			}
+
+			/* now p has stepped `insize' places along, and q has too */
+			p = q;
+		}
+
+		/* the list is non-empty here, so at least one element was appended and tail is set */
+		tail->next = NULL;
+
+		/* If we have done only one merge, we're finished. */
+		if (nmerges <= 1)
+			return list;
+
+		/* Otherwise repeat, merging lists twice the size */
+		insize *= 2;
+	}
+#if COMPILER_REQUIRES_SORT_RETURN
+	return list;
+#endif
+}
--- a/coremark/src/core_main.c
+++ b/coremark/src/core_main.c
@ -0,0 +1,339 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+/* File: core_main.c
+	This file contains the framework to acquire a block of memory, seed initial parameters, run the benchmark and report the results.
+*/
+#include "coremark.h"
+
+/* Function: iterate
+	Run the benchmark for a specified number of iterations.
+
+	Operation:
+	For each type of benchmarked algorithm:
+		a - Initialize the data block for the algorithm.
+		b - Execute the algorithm N times.
+
+	Returns:
+	NULL.
+*/
+/* Expected CRC values for the list, matrix and state algorithms, five entries each.
+	NOTE(review): the code that reads these tables is not in view; presumably they
+	are indexed by the known_id that main derives from the seed CRC - confirm. */
+static ee_u16 list_known_crc[]   =      {(ee_u16)0xd4b0,(ee_u16)0x3340,(ee_u16)0x6a79,(ee_u16)0xe714,(ee_u16)0xe3c1};
+static ee_u16 matrix_known_crc[] =      {(ee_u16)0xbe52,(ee_u16)0x1199,(ee_u16)0x5608,(ee_u16)0x1fd7,(ee_u16)0x0747};
+static ee_u16 state_known_crc[]  =      {(ee_u16)0x5e47,(ee_u16)0x39bf,(ee_u16)0xe5a4,(ee_u16)0x8e3a,(ee_u16)0x8d84};
+/* Run res->iterations passes of the list benchmark, folding the result of
+	every pass into res->crc. pres must point to a core_results. Always returns NULL. */
+void *iterate(void *pres) {
+	core_results *res=(core_results *)pres;
+	const ee_u32 total=res->iterations;
+	ee_u32 pass=0;
+	ee_u16 step;
+
+	/* start from clean output CRCs */
+	res->crcstate=0;
+	res->crcmatrix=0;
+	res->crclist=0;
+	res->crc=0;
+
+	while (pass<total) {
+		/* The benchmark result is stored in a local first: core_bench_list can
+		   itself update res->crc, so it must finish before res->crc is read. */
+		step=core_bench_list(res,1);
+		res->crc=crcu16(step,res->crc);
+		step=core_bench_list(res,-1);
+		res->crc=crcu16(step,res->crc);
+		/* the CRC after the very first pass is kept as the list CRC */
+		if (pass==0)
+			res->crclist=res->crc;
+		pass++;
+	}
+	return NULL;
+}
+
+/* Seed access: get_seed(x) yields a 16b seed, get_seed_32(x) the full 32b value.
+	With SEED_ARG both are macros that read main's argc/argv, so they can only be
+	used where those names are in scope; otherwise get_seed_32 is a real function. */
+#if (SEED_METHOD==SEED_ARG)
+ee_s32 get_seed_args(int i, int argc, char *argv[]);
+#define get_seed(x) (ee_s16)get_seed_args(x,argc,argv)
+#define get_seed_32(x) get_seed_args(x,argc,argv)
+#else /* via function or volatile */
+ee_s32 get_seed_32(int i);
+#define get_seed(x) (ee_s16)get_seed_32(x)
+#endif
+
+/* Data block used by main when MEM_METHOD is MEM_STATIC. */
+#if (MEM_METHOD==MEM_STATIC)
+ee_u8 static_memblk[TOTAL_DATA_SIZE];
+#endif
+/* Names in the order of the MEM_STATIC / MEM_MALLOC / MEM_STACK values (0, 1, 2). */
+char *mem_name[3] = {"Static","Heap","Stack"};
+/* Function: main
+	Main entry routine for the benchmark.
+	This function is responsible for the following steps:
+
+	1 - Initialize input seeds from a source that cannot be determined at compile time.
+	2 - Initialize memory block for use.
+	3 - Run and time the benchmark.
+	4 - Report results, testing the validity of the output if the seeds are known.
+
+	Arguments:
+	1 - first seed  : Any value
+	2 - second seed : Must be identical to first for iterations to be identical
+	3 - third seed  : Any value, should be at least an order of magnitude less than the input size, but bigger than 32.
+	4 - Iterations  : Special, if set to 0, iterations will be automatically determined such that the benchmark will run between 10 to 100 secs
+
+*/
+
+#if MAIN_HAS_NOARGC
+MAIN_RETURN_TYPE main(void) {
+	int argc=0;
+	char *argv[1];
+#else
+MAIN_RETURN_TYPE main(int argc, char *argv[]) {
+#endif
+	ee_u16 i,j=0,num_algorithms=0;
+	ee_s16 known_id=-1,total_errors=0;
+	ee_u16 seedcrc=0;
+	CORE_TICKS total_time;
+	core_results results[MULTITHREAD];
+#if (MEM_METHOD==MEM_STACK)
+	ee_u8 stack_memblock[TOTAL_DATA_SIZE*MULTITHREAD];
+#endif
+
+  ioe_init();
+
+  ee_printf("Running CoreMark for %d iterations\n", ITERATIONS);
+
+	/* first call any initializations needed */
+	portable_init(&(results[0].port), &argc, argv);
+	/* First some checks to make sure benchmark will run ok */
+	if (sizeof(struct list_head_s)>128) {
+		ee_printf("list_head structure too big for comparable data!\n");
+		return MAIN_RETURN_VAL;
+	}
+	results[0].seed1=get_seed(1);
+	results[0].seed2=get_seed(2);
+	results[0].seed3=get_seed(3);
+	results[0].iterations=get_seed_32(4);
+#if CORE_DEBUG
+	results[0].iterations=1;
+#endif
+	results[0].execs=get_seed_32(5);
+	if (results[0].execs==0) { /* if not supplied, execute all algorithms */
+		results[0].execs=ALL_ALGORITHMS_MASK;
+	}
+		/* put in some default values based on one seed only for easy testing */
+	if ((results[0].seed1==0) && (results[0].seed2==0) && (results[0].seed3==0)) { /* validation run */
+		results[0].seed1=0;
+		results[0].seed2=0;
+		results[0].seed3=0x66;
+	}
+	if ((results[0].seed1==1) && (results[0].seed2==0) && (results[0].seed3==0)) { /* perfromance run */
+		results[0].seed1=0x3415;
+		results[0].seed2=0x3415;
+		results[0].seed3=0x66;
+	}
+#if (MEM_METHOD==MEM_STATIC)
+	results[0].memblock[0]=(void *)static_memblk;
+	results[0].size=TOTAL_DATA_SIZE;
+	results[0].err=0;
+	#if (MULTITHREAD>1)
+	#error "Cannot use a static data area with multiple contexts!"
+	#endif
+#elif (MEM_METHOD==MEM_MALLOC)
+	for (i=0 ; i<MULTITHREAD; i++) {
+		ee_s32 malloc_override=get_seed(7);
+		if (malloc_override != 0)
+			results[i].size=malloc_override;
+		else
+			results[i].size=TOTAL_DATA_SIZE;
+		results[i].memblock[0]=portable_malloc(results[i].size);
+		results[i].seed1=results[0].seed1;
+		results[i].seed2=results[0].seed2;
+		results[i].seed3=results[0].seed3;
+		results[i].err=0;
+		results[i].execs=results[0].execs;
+	}
+#elif (MEM_METHOD==MEM_STACK)
+	for (i=0 ; i<MULTITHREAD; i++) {
+		results[i].memblock[0]=stack_memblock+i*TOTAL_DATA_SIZE;
+		results[i].size=TOTAL_DATA_SIZE;
+		results[i].seed1=results[0].seed1;
+		results[i].seed2=results[0].seed2;
+		results[i].seed3=results[0].seed3;
+		results[i].err=0;
+		results[i].execs=results[0].execs;
+	}
+#else
+#error "Please define a way to initialize a memory block."
+#endif
+	/* Data init */
+	/* Find out how space much we have based on number of algorithms */
+	for (i=0; i<NUM_ALGORITHMS; i++) {
+		if ((1<<(ee_u32)i) & results[0].execs)
+			num_algorithms++;
+	}
+	for (i=0 ; i<MULTITHREAD; i++)
+		results[i].size=results[i].size/num_algorithms;
+	/* Assign pointers */
+	for (i=0; i<NUM_ALGORITHMS; i++) {
+		ee_u32 ctx;
+		if ((1<<(ee_u32)i) & results[0].execs) {
+			for (ctx=0 ; ctx<MULTITHREAD; ctx++)
+				results[ctx].memblock[i+1]=(char *)(results[ctx].memblock[0])+results[0].size*j;
+			j++;
+		}
+	}
+	/* call inits */
+	for (i=0 ; i<MULTITHREAD; i++) {
+		if (results[i].execs & ID_LIST) {
+			results[i].list=core_list_init(results[0].size,results[i].memblock[1],results[i].seed1);
+		}
+		if (results[i].execs & ID_MATRIX) {
+			core_init_matrix(results[0].size, results[i].memblock[2], (ee_s32)results[i].seed1 | (((ee_s32)results[i].seed2) << 16), &(results[i].mat) );
+		}
+		if (results[i].execs & ID_STATE) {
+			core_init_state(results[0].size,results[i].seed1,results[i].memblock[3]);
+		}
+	}
+
+	/* automatically determine number of iterations if not set */
+	if (results[0].iterations==0) {
+		secs_ret secs_passed=0;
+		ee_u32 divisor;
+		results[0].iterations=1;
+		while (secs_passed < (secs_ret)1) {
+			results[0].iterations*=10;
+			start_time();
+			iterate(&results[0]);
+			stop_time();
+			secs_passed=time_in_secs(get_time());
+		}
+		/* now we know it executes for at least 1 sec, set actual run time at about 10 secs */
+		divisor=(ee_u32)secs_passed;
+		if (divisor==0) /* some machines cast float to int as 0 since this conversion is not defined by ANSI, but we know at least one second passed */
+			divisor=1;
+		results[0].iterations*=1+10/divisor;
+	}
+	/* perform actual benchmark */
+	start_time();
+#if (MULTITHREAD>1)
+	if (default_num_contexts>MULTITHREAD) {
+		default_num_contexts=MULTITHREAD;
+	}
+	for (i=0 ; i<default_num_contexts; i++) {
+		results[i].iterations=results[0].iterations;
+		results[i].execs=results[0].execs;
+		core_start_parallel(&results[i]);
+	}
+	for (i=0 ; i<default_num_contexts; i++) {
+		core_stop_parallel(&results[i]);
+	}
+#else
+	iterate(&results[0]);
+#endif
+	stop_time();
+	total_time=get_time();
+	/* get a function of the input to report */
+	seedcrc=crc16(results[0].seed1,seedcrc);
+	seedcrc=crc16(results[0].seed2,seedcrc);
+	seedcrc=crc16(results[0].seed3,seedcrc);
+	seedcrc=crc16(results[0].size,seedcrc);
+
+	switch (seedcrc) { /* test known output for common seeds */
+		case 0x8a02: /* seed1=0, seed2=0, seed3=0x66, size 2000 per algorithm */
+			known_id=0;
+			ee_printf("6k performance run parameters for coremark.\n");
+			break;
+		case 0x7b05: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 2000 per algorithm */
+			known_id=1;
+			ee_printf("6k validation run parameters for coremark.\n");
+			break;
+		case 0x4eaf: /* seed1=0x8, seed2=0x8, seed3=0x8, size 400 per algorithm */
+			known_id=2;
+			ee_printf("Profile generation run parameters for coremark.\n");
+			break;
+		case 0xe9f5: /* seed1=0, seed2=0, seed3=0x66, size 666 per algorithm */
+			known_id=3;
+			ee_printf("2K performance run parameters for coremark.\n");
+			break;
+		case 0x18f2: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 666 per algorithm */
+			known_id=4;
+			ee_printf("2K validation run parameters for coremark.\n");
+			break;
+		default:
+			total_errors=-1;
+			break;
+	}
+	if (known_id>=0) {
+		for (i=0 ; i<default_num_contexts; i++) {
+			results[i].err=0;
+			if ((results[i].execs & ID_LIST) &&
+				(results[i].crclist!=list_known_crc[known_id])) {
+				ee_printf("[%u]ERROR! list crc 0x%04x - should be 0x%04x\n",i,results[i].crclist,list_known_crc[known_id]);
+				results[i].err++;
+			}
+			if ((results[i].execs & ID_MATRIX) &&
+				(results[i].crcmatrix!=matrix_known_crc[known_id])) {
+				ee_printf("[%u]ERROR! matrix crc 0x%04x - should be 0x%04x\n",i,results[i].crcmatrix,matrix_known_crc[known_id]);
+				results[i].err++;
+			}
+			if ((results[i].execs & ID_STATE) &&
+				(results[i].crcstate!=state_known_crc[known_id])) {
+				ee_printf("[%u]ERROR! state crc 0x%04x - should be 0x%04x\n",i,results[i].crcstate,state_known_crc[known_id]);
+				results[i].err++;
+			}
+			total_errors+=results[i].err;
+		}
+	}
+	total_errors+=check_data_types();
+	/* and report results */
+	ee_printf("CoreMark Size    : %d\n",(int)results[0].size);
+#if HAS_FLOAT
+	ee_printf("Total time (ms)  : %f\n",time_in_secs(total_time));
+	if (time_in_secs(total_time) > 0)
+		ee_printf("Iterations/mSec  : %f\n",default_num_contexts*results[0].iterations/time_in_secs(total_time));
+#else
+	ee_printf("Total time (ms)  : %d\n",time_in_secs(total_time));
+#endif
+	ee_printf("Iterations       : %d\n",(int)default_num_contexts*results[0].iterations);
+	ee_printf("Compiler version : %s\n",COMPILER_VERSION);
+#if (MULTITHREAD>1)
+	ee_printf("Parallel %s : %d\n",PARALLEL_METHOD,default_num_contexts);
+#endif
+	/* output for verification */
+	ee_printf("seedcrc          : 0x%04x\n",seedcrc);
+	if (results[0].execs & ID_LIST)
+		for (i=0 ; i<default_num_contexts; i++)
+			ee_printf("[%d]crclist       : 0x%04x\n",i,results[i].crclist);
+	if (results[0].execs & ID_MATRIX)
+		for (i=0 ; i<default_num_contexts; i++)
+			ee_printf("[%d]crcmatrix     : 0x%04x\n",i,results[i].crcmatrix);
+	if (results[0].execs & ID_STATE)
+		for (i=0 ; i<default_num_contexts; i++)
+			ee_printf("[%d]crcstate      : 0x%04x\n",i,results[i].crcstate);
+	for (i=0 ; i<default_num_contexts; i++)
+		ee_printf("[%d]crcfinal      : 0x%04x\n",i,results[i].crc);
+  ee_printf("Finised in %d ms.\n", (int)total_time);
+	if (total_errors==0) {
+    ee_printf("==================================================\n");
+	  ee_printf("CoreMark PASS       %d Marks\n", 2921400 / time_in_secs(total_time) * ITERATIONS / 1000);
+	  ee_printf("                vs. 100000 Marks (i7-7700K @ 4.20GHz)\n");
+  }
+	if (total_errors>0)
+		ee_printf("Errors detected\n");
+	if (total_errors<0)
+		ee_printf("Cannot validate operation for these seed values, please compare with results on a known platform.\n");
+
+#if (MEM_METHOD==MEM_MALLOC)
+	for (i=0 ; i<MULTITHREAD; i++)
+		portable_free(results[i].memblock[0]);
+#endif
+	/* And last call any target specific code for finalizing */
+	portable_fini(&(results[0].port));
+
+	return MAIN_RETURN_VAL;
+}
+
+
--- a/coremark/src/core_matrix.c
+++ b/coremark/src/core_matrix.c
@ -0,0 +1,308 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+#include "coremark.h"
+/*
+Topic: Description
+	Matrix manipulation benchmark
+
+	This very simple algorithm forms the basis of many more complex algorithms.
+
+	The tight inner loop is the focus of many optimizations (compiler as well as hardware based)
+	and is thus relevant for embedded processing.
+
+	The total available data space will be divided into 3 parts:
+	NxN Matrix A - initialized with small values (upper 3/4 of the bits all zero).
+	NxN Matrix B - initialized with medium values (upper half of the bits all zero).
+	NxN Matrix C - used for the result.
+
+	The actual values for A and B must be derived based on input that is not available at compile time.
+*/
+/* Forward declarations of the matrix kernels defined later in this file. */
+ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val);
+ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval);
+void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val);
+void matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val);
+
+/* Helper macros for the matrix kernels.
+	matrix_test_next - x plus one. The argument is parenthesized so that an
+	                   expression argument such as a<<b is incremented as a whole.
+	matrix_clip      - low 8 bits of x when y is non-zero, otherwise the low 16 bits of x.
+	matrix_big       - x with the bits of 0xf000 ORed in.
+	bit_extract      - shift x right by <from> bits and keep the low <to> bits
+	                   (<to> acts as a bit count, not as an end position).
+*/
+#define matrix_test_next(x) ((x)+1)
+#define matrix_clip(x,y) ((y) ? (x) & 0x0ff : (x) & 0x0ffff)
+#define matrix_big(x) (0xf000 | (x))
+#define bit_extract(x,from,to) (((x)>>(from)) & (~(0xffffffff << (to))))
+
+#if CORE_DEBUG
+/* Debug helper: print the NxN matrix A under the given name,
+   one row per line with the values separated by commas. */
+void printmat(MATDAT *A, ee_u32 N, char *name) {
+	ee_u32 row,col;
+	ee_printf("Matrix %s [%dx%d]:\n",name,N,N);
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			/* separator goes in front of every value but the first of a row */
+			if (col>0)
+				ee_printf(",");
+			ee_printf("%d",A[row*N+col]);
+		}
+		ee_printf("\n");
+	}
+}
+/* Debug helper: print the NxN result matrix C under the given name,
+   one row per line with the values separated by commas. */
+void printmatC(MATRES *C, ee_u32 N, char *name) {
+	ee_u32 row,col;
+	ee_printf("Matrix %s [%dx%d]:\n",name,N,N);
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			/* separator goes in front of every value but the first of a row */
+			if (col>0)
+				ee_printf(",");
+			ee_printf("%d",C[row*N+col]);
+		}
+		ee_printf("\n");
+	}
+}
+#endif
+/* Function: core_bench_matrix
+	Benchmark entry point for the matrix workload.
+
+	Runs <matrix_test> once on the matrices described by p, passing the seed
+	as the constant value, and folds the returned value into the running crc.
+
+	Parameters:
+	p - matrix dimension and the A, B and C buffers.
+	seed - run time value used as the constant for the matrix operations.
+	crc - crc accumulated so far.
+
+	Returns:
+	The updated crc.
+*/
+ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc) {
+	ee_s16 test_result=matrix_test(p->N,p->C,p->A,p->B,(MATDAT)seed);
+
+	return crc16(test_result,crc);
+}
+
+/* Function: matrix_test
+	Run every matrix kernel once and summarize the results in a crc.
+
+	Parameters:
+	N - Dimensions of the matrix.
+	C - memory for result matrix.
+	A - input matrix (modified during the run, restored before returning)
+	B - operator matrix (not changed during operations)
+	val - constant used by the add and multiply-by-constant steps.
+
+	Returns:
+	A CRC value that captures all results calculated in the function.
+	In particular, crc of the value calculated on the result matrix
+	after each step by <matrix_sum>.
+
+	Operation:
+
+	1 - Add val to all elements of A.
+	2 - Multiply A by val into C.
+	3 - Multiply A by a vector taken from B into C.
+	4 - Multiply A by B into C.
+	5 - Multiply A by B into C, extracting bits from each product.
+	6 - Add -val to all elements of A.
+
+	After the last step, matrix A is back to original contents.
+*/
+ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val) {
+	MATDAT limit=matrix_big(val);
+	ee_u16 signature=0;
+
+	/* perturb A so that the data depends on the run time value */
+	matrix_add_const(N,A,val);
+#if CORE_DEBUG
+	printmat(A,N,"matrix_add_const");
+#endif
+
+	/* every kernel writes into C; its <matrix_sum> is folded in before the next kernel runs */
+	matrix_mul_const(N,C,A,val);
+	signature=crc16(matrix_sum(N,C,limit),signature);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_const");
+#endif
+
+	matrix_mul_vect(N,C,A,B);
+	signature=crc16(matrix_sum(N,C,limit),signature);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_vect");
+#endif
+
+	matrix_mul_matrix(N,C,A,B);
+	signature=crc16(matrix_sum(N,C,limit),signature);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_matrix");
+#endif
+
+	matrix_mul_matrix_bitextract(N,C,A,B);
+	signature=crc16(matrix_sum(N,C,limit),signature);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_matrix_bitextract");
+#endif
+
+	/* undo the first step so that A holds its initial values again */
+	matrix_add_const(N,A,-val);
+	return signature;
+}
+
+/* Function : core_init_matrix
+	Initialize the memory block for matrix benchmarking.
+
+	The dimension N is the largest value for which N*N*8 is still below blksize;
+	it is 0 when blksize is too small for a 1x1 matrix (including blksize==0).
+	A starts at the aligned memblk, B follows A, and C starts at the aligned
+	address after B.
+
+	Parameters:
+	blksize - Size of memory to be initialized.
+	memblk - Pointer to memory block.
+	seed - Actual values chosen depend on the seed parameter (0 is replaced by 1).
+	p - pointers to <mat_params> containing initialized matrixes.
+
+	Returns:
+	Matrix dimensions.
+
+	Note:
+	The seed parameter MUST be supplied from a source that cannot be determined at compile time
+*/
+ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p) {
+	ee_u32 N=0;
+	MATDAT *A;
+	MATDAT *B;
+	ee_s32 order=1;
+	MATDAT val;
+	ee_u32 i=0,j=0;
+	if (seed==0)
+		seed=1;
+	/* find the first i whose footprint i*i*2*4 reaches blksize */
+	while (j<blksize) {
+		i++;
+		j=i*i*2*4;
+	}
+	/* blksize==0 never enters the loop above and leaves i at 0; i-1 would then
+	   wrap to 0xffffffff and the fill loops below would run far past memblk */
+	N=(i>0) ? i-1 : 0;
+	A=(MATDAT *)align_mem(memblk);
+	B=A+N*N;
+
+	for (i=0; i<N; i++) {
+		for (j=0; j<N; j++) {
+			seed = ( ( order * seed ) % 65536 );
+			val = (seed + order);
+			/* B keeps the low 16 bits of the generated value */
+			val=matrix_clip(val,0);
+			B[i*N+j] = val;
+			val =  (val + order);
+			/* A keeps only the low 8 bits */
+			val=matrix_clip(val,1);
+			A[i*N+j] = val;
+			order++;
+		}
+	}
+
+	p->A=A;
+	p->B=B;
+	p->C=(MATRES *)align_mem(B+N*N);
+	p->N=N;
+#if CORE_DEBUG
+	printmat(A,N,"A");
+	printmat(B,N,"B");
+#endif
+	return N;
+}
+
+/* Function: matrix_sum
+	Calculate a function that depends on the values of elements in the matrix.
+
+	Every element is added to a running total.
+
+	When the running total exceeds clipval, the total is reset to 0 and 10 is
+	added to the result.
+
+	Otherwise 1 is added to the result if the element is bigger than the previous one.
+*/
+ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval) {
+	MATRES running=0,last=0,elem=0;
+	ee_s16 score=0;
+	ee_u32 row,col;
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			elem=C[row*N+col];
+			running+=elem;
+			if (running>clipval) {
+				score+=10;
+				running=0;
+			} else if (elem>last) {
+				score++;
+			}
+			last=elem;
+		}
+	}
+	return score;
+}
+
+/* Function: matrix_mul_const
+	Store in C every element of A multiplied by val.
+	Both operands are widened to MATRES before the multiplication.
+*/
+void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val) {
+	ee_u32 row,col;
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			ee_u32 idx=row*N+col;
+			C[idx]=(MATRES)A[idx] * (MATRES)val;
+		}
+	}
+}
+
+/* Function: matrix_add_const
+	Add val, in place, to each of the NxN elements of A.
+*/
+void matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val) {
+	ee_u32 row,col;
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			MATDAT *cell=&A[row*N+col];
+			*cell += val;
+		}
+	}
+}
+
+/* Function: matrix_mul_vect
+	Multiply a matrix by a vector.
+	The first N elements of B are used as the vector, and only the first N
+	elements of C are written.
+*/
+void matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
+	ee_u32 row,k;
+	for (row=0; row<N; row++) {
+		MATRES *out=&C[row];
+		*out=0;
+		for (k=0; k<N; k++) {
+			*out+=(MATRES)A[row*N+k] * (MATRES)B[k];
+		}
+	}
+}
+
+/* Function: matrix_mul_matrix
+	Multiply the NxN matrix A by the NxN matrix B and store the product in C.
+	Operands are widened to MATRES before each multiplication.
+*/
+void matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
+	ee_u32 row,col,k;
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			MATRES *out=&C[row*N+col];
+			*out=0;
+			for (k=0; k<N; k++) {
+				*out+=(MATRES)A[row*N+k] * (MATRES)B[k*N+col];
+			}
+		}
+	}
+}
+
+/* Function: matrix_mul_matrix_bitextract
+	Same traversal as <matrix_mul_matrix>, but instead of summing the products
+	themselves, each product contributes the multiplication of two bit fields
+	taken from it with bit_extract.
+*/
+void matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
+	ee_u32 row,col,k;
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			MATRES *out=&C[row*N+col];
+			*out=0;
+			for (k=0; k<N; k++) {
+				MATRES prod=(MATRES)A[row*N+k] * (MATRES)B[k*N+col];
+				*out+=bit_extract(prod,2,4)*bit_extract(prod,5,7);
+			}
+		}
+	}
+}
--- a/coremark/src/core_portme.c
+++ b/coremark/src/core_portme.c
@ -0,0 +1,109 @@
+#include "coremark.h"
+
+/* Seed values, kept volatile so that they are read from memory at run time.
+	seed1..seed3 depend on which of VALIDATION_RUN / PERFORMANCE_RUN / PROFILE_RUN
+	is non-zero. NOTE(review): presumably exactly one of them is set by the
+	build or by coremark.h - confirm, since none being set leaves seed1..3 undefined. */
+#if VALIDATION_RUN
+	volatile ee_s32 seed1_volatile=0x3415;
+	volatile ee_s32 seed2_volatile=0x3415;
+	volatile ee_s32 seed3_volatile=0x66;
+#endif
+#if PERFORMANCE_RUN
+	volatile ee_s32 seed1_volatile=0x0;
+	volatile ee_s32 seed2_volatile=0x0;
+	volatile ee_s32 seed3_volatile=0x66;
+#endif
+#if PROFILE_RUN
+	volatile ee_s32 seed1_volatile=0x8;
+	volatile ee_s32 seed2_volatile=0x8;
+	volatile ee_s32 seed3_volatile=0x8;
+#endif
+	/* seed4 starts out as ITERATIONS; presumably the requested iteration count - TODO confirm against main */
+	volatile ee_s32 seed4_volatile=ITERATIONS;
+	volatile ee_s32 seed5_volatile=0;
+/* Porting : Timing functions
+	How to capture time and convert to seconds must be ported to whatever is supported by the platform.
+	e.g. Read value from on board RTC, read value from cpu clock cycles performance counter etc.
+	Sample implementation for standard time.h and windows.h definitions included.
+*/
+/* Define : TIMER_RES_DIVIDER
+	Divider to trade off timer resolution and total time that can be measured.
+
+	Use lower values to increase resolution, but make sure that overflow does not occur.
+	If there are issues with the return value overflowing, increase this value.
+
+	NOTE(review): none of the macros below is referenced by the timing functions
+	in this file, which read the time through uptime_ms() instead; they look like
+	leftovers of the clock() based sample implementation - confirm before relying on them.
+	*/
+#define NSECS_PER_SEC CLOCKS_PER_SEC
+#define CORETIMETYPE clock_t
+#define GETMYTIME(_t) (*_t=clock())
+#define MYTIMEDIFF(fin,ini) ((fin)-(ini))
+#define TIMER_RES_DIVIDER 1
+#define SAMPLE_TIME_IMPLEMENTATION 1
+#define EE_TICKS_PER_SEC (NSECS_PER_SEC / TIMER_RES_DIVIDER)
+
+/* Uptime in milliseconds: the microsecond count read from AM_TIMER_UPTIME divided by 1000. */
+static uint32_t uptime_ms() {
+	return (uint32_t)(io_read(AM_TIMER_UPTIME).us / 1000);
+}
+
+/** Millisecond timestamps stored by start_time() and stop_time(); get_time() returns their difference. */
+unsigned long start_time_val, stop_time_val;
+
+/* Function : start_time
+	This function will be called right before starting the timed portion of the benchmark.
+
+	Stores the current uptime in milliseconds in start_time_val.
+*/
+void start_time(void) {
+	uint32_t now=uptime_ms();
+
+	start_time_val=now;
+}
+/* Function : stop_time
+	This function will be called right after ending the timed portion of the benchmark.
+
+	Stores the current uptime in milliseconds in stop_time_val.
+*/
+void stop_time(void) {
+	uint32_t now=uptime_ms();
+
+	stop_time_val=now;
+}
+/* Function : get_time
+	Return an abstract "ticks" number that signifies time on the system.
+
+	In this port a tick is one millisecond: the value is the difference between
+	the timestamps stored by <stop_time> and <start_time>.
+*/
+CORE_TICKS get_time(void) {
+	unsigned long elapsed=stop_time_val - start_time_val;
+
+	return (CORE_TICKS)elapsed;
+}
+
+/* Function : time_in_secs
+	Convert the value returned by get_time to the unit used for reporting.
+
+	Despite its name, this port hands the tick count back unchanged, so the
+	result is in milliseconds (the ticks produced by <get_time>), not seconds.
+	The <secs_ret> type is used to accommodate systems with no support for floating point.
+*/
+secs_ret time_in_secs(CORE_TICKS ticks) {
+	secs_ret elapsed=ticks;
+
+	return elapsed;
+}
+
+/* Number of contexts to run; this port starts with a single context. */
+ee_u32 default_num_contexts=1;
+
+/* Function : portable_init
+	Target specific initialization code.
+
+	Prints an error if ee_ptr_int does not have the size of a pointer or if
+	ee_u32 is not 4 bytes wide, then marks the port as initialized by setting
+	portable_id to 1. argc and argv are not used by this port.
+*/
+void portable_init(core_portable *p, int *argc, char *argv[])
+{
+	const int ptr_int_ok=(sizeof(ee_ptr_int) == sizeof(ee_u8 *));
+	const int u32_ok=(sizeof(ee_u32) == 4);
+
+	if (!ptr_int_ok) {
+		ee_printf("ERROR! Please define ee_ptr_int to a type that holds a pointer!\n");
+	}
+	if (!u32_ok) {
+		ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
+	}
+	p->portable_id=1;
+}
+/* Function : portable_fini
+	Target specific final code.
+
+	Clears portable_id, the counterpart of the value set by <portable_init>.
+*/
+void portable_fini(core_portable *p)
+{
+	core_portable *port=p;
+
+	port->portable_id=0;
+}
+
+
--- a/coremark/src/core_state.c
+++ b/coremark/src/core_state.c
@ -0,0 +1,277 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+#include "coremark.h"
+/* local functions */
+enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count);
+
+/*
+Topic: Description
+	Simple state machines like this one are used in many embedded products.
+
+	For more complex state machines, sometimes a state transition table implementation is used instead,
+	trading speed of direct coding for ease of maintenance.
+
+	Since the main goal of using a state machine in CoreMark is to exercise the switch/if behaviour,
+	we are using a small Moore machine.
+
+	In particular, this machine tests type of string input,
+	trying to determine whether the input is a number or something else.
+	(see core_state.png).
+*/
+
+/* Function: core_bench_state
+	Benchmark function
+
+	Go over the input twice, once direct, and once after introducing some corruption.
+
+	Parameters:
+	blksize - number of bytes of memblock covered by the corruption passes.
+	memblock - input buffer; scanning stops at the first zero byte.
+	seed1 - value XORed into the buffer before the second scan.
+	seed2 - value XORed into the buffer after the second scan.
+	step - distance in bytes between two modified positions.
+	crc - crc accumulated so far.
+
+	Returns:
+	crc updated with the final state counters and the transition counters.
+*/
+ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock,
+		ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc)
+{
+	/* final_counts: how many tokens ended in each state;
+	   track_counts: transition counters maintained by core_state_transition */
+	ee_u32 final_counts[NUM_CORE_STATES];
+	ee_u32 track_counts[NUM_CORE_STATES];
+	ee_u8 *p=memblock;
+	ee_u32 i;
+
+
+#if CORE_DEBUG
+	ee_printf("State Bench: %d,%d,%d,%04x\n",seed1,seed2,step,crc);
+#endif
+	for (i=0; i<NUM_CORE_STATES; i++) {
+		final_counts[i]=track_counts[i]=0;
+	}
+	/* run the state machine over the input */
+	/* note: the closing brace of this loop sits inside both preprocessor branches below */
+	while (*p!=0) {
+		enum CORE_STATE fstate=core_state_transition(&p,track_counts);
+		final_counts[fstate]++;
+#if CORE_DEBUG
+	ee_printf("%d,",fstate);
+	}
+	ee_printf("\n");
+#else
+	}
+#endif
+	p=memblock;
+	while (p < (memblock+blksize)) { /* insert some corruption; separators are left intact */
+		if (*p!=',')
+			*p^=(ee_u8)seed1;
+		p+=step;
+	}
+	p=memblock;
+	/* run the state machine over the input again */
+	/* note: same brace arrangement as the first scan */
+	while (*p!=0) {
+		enum CORE_STATE fstate=core_state_transition(&p,track_counts);
+		final_counts[fstate]++;
+#if CORE_DEBUG
+	ee_printf("%d,",fstate);
+	}
+	ee_printf("\n");
+#else
+	}
+#endif
+	p=memblock;
+	while (p < (memblock+blksize)) { /* undo corruption if seed1 and seed2 are equal */
+		if (*p!=',')
+			*p^=(ee_u8)seed2;
+		p+=step;
+	}
+	/* end timing */
+	/* fold both counter arrays, interleaved per state, into the crc */
+	for (i=0; i<NUM_CORE_STATES; i++) {
+		crc=crcu32(final_counts[i],crc);
+		crc=crcu32(track_counts[i],crc);
+	}
+	return crc;
+}
+
+/* Default initialization patterns.
+   Four strings per category; core_init_state copies 4 bytes of an intpat
+   entry and 8 bytes of an entry from the other tables. */
+static ee_u8 *intpat[4]  ={(ee_u8 *)"5012",(ee_u8 *)"1234",(ee_u8 *)"-874",(ee_u8 *)"+122"};
+static ee_u8 *floatpat[4]={(ee_u8 *)"35.54400",(ee_u8 *)".1234500",(ee_u8 *)"-110.700",(ee_u8 *)"+0.64400"};
+static ee_u8 *scipat[4]  ={(ee_u8 *)"5.500e+3",(ee_u8 *)"-.123e-2",(ee_u8 *)"-87e+832",(ee_u8 *)"+0.6e-12"};
+static ee_u8 *errpat[4]  ={(ee_u8 *)"T0.3e-1F",(ee_u8 *)"-T.T++Tq",(ee_u8 *)"1T3.4e4z",(ee_u8 *)"34.0e-T^"};
+
+/* Function: core_init_state
+	Initialize the input data for the state machine.
+
+	Fills p with predetermined strings, each followed by a ',' separator, for
+	as long as the next string still fits; the rest of the buffer up to size
+	is filled with zeros.
+	The seed is incremented for every string: its low 3 bits pick the category
+	(int, float, scientific or invalid) and the next 2 bits pick the entry.
+
+	Note:
+	The seed parameter MUST be supplied from a source that cannot be determined at compile time
+*/
+void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p) {
+	ee_u32 filled=0,pending=0,k;
+	ee_u32 limit;
+	ee_u8 *pattern=0;
+#if CORE_DEBUG
+	ee_u8 *start=p;
+	ee_printf("State: %d,%d\n",size,seed);
+#endif
+	/* strings may only occupy the buffer up to one byte short of size */
+	limit=size-1;
+	while ((filled+pending+1)<limit) {
+		/* emit the string chosen on the previous round, then its separator */
+		if (pending>0) {
+			ee_u8 *dst=p+filled;
+			for (k=0; k<pending; k++)
+				dst[k]=pattern[k];
+			dst[pending]=',';
+			filled+=pending+1;
+		}
+		/* choose the string for the next round */
+		seed++;
+		switch (seed & 0x7) {
+			case 0:
+			case 1:
+			case 2: /* int */
+				pattern=intpat[(seed>>3) & 0x3];
+				pending=4;
+				break;
+			case 3:
+			case 4: /* float */
+				pattern=floatpat[(seed>>3) & 0x3];
+				pending=8;
+				break;
+			case 5:
+			case 6: /* scientific */
+				pattern=scipat[(seed>>3) & 0x3];
+				pending=8;
+				break;
+			case 7: /* invalid */
+				pattern=errpat[(seed>>3) & 0x3];
+				pending=8;
+				break;
+			default: /* Never happen, just to make some compilers happy */
+				break;
+		}
+	}
+	/* fill the rest with 0 */
+	while (filled<size)
+		p[filled++]=0;
+#if CORE_DEBUG
+	ee_printf("State Input: %s\n",start);
+#endif
+}
+
+/* Return 1 when c lies in '0'..'9' and 0 otherwise.
+   The two comparisons are combined with a bitwise &, so both are always evaluated. */
+static ee_u8 ee_isdigit(ee_u8 c) {
+	return ((c>='0') & (c<='9')) ? 1 : 0;
+}
+
+/* Function: core_state_transition
+	Actual state machine.
+
+	The state machine will continue scanning until either:
+	1 - an invalid input is detected.
+	2 - a ',' separator or the terminating zero byte is reached.
+
+	Parameters:
+	instr - in/out pointer to the current position in the input. On return it points
+	        just past the separator, or just past the character that made the token
+	        invalid, or at the terminating zero byte.
+	transition_count - per state counters, updated as documented in the cases below.
+
+	Returns:
+	The state the machine was in when scanning stopped.
+*/
+
+enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count) {
+	ee_u8 *str=*instr;
+	ee_u8 NEXT_SYMBOL;
+	enum CORE_STATE state=CORE_START;
+	/* the increment still runs once after the state turned CORE_INVALID,
+	   so the offending character is consumed as well */
+	for( ; *str && state != CORE_INVALID; str++ ) {
+		NEXT_SYMBOL = *str;
+		if (NEXT_SYMBOL==',') /* end of this input */ {
+			str++;
+			break;
+		}
+		switch(state) {
+		case CORE_START:
+			/* first character of a token: digit, sign or decimal point */
+			if(ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INT;
+			}
+			else if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
+				state = CORE_S1;
+			}
+			else if( NEXT_SYMBOL == '.' ) {
+				state = CORE_FLOAT;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_INVALID]++;
+			}
+			/* CORE_START is counted for every token, whatever the outcome */
+			transition_count[CORE_START]++;
+			break;
+		case CORE_S1:
+			/* after a leading sign; counted on every path */
+			if(ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INT;
+				transition_count[CORE_S1]++;
+			}
+			else if( NEXT_SYMBOL == '.' ) {
+				state = CORE_FLOAT;
+				transition_count[CORE_S1]++;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_S1]++;
+			}
+			break;
+		case CORE_INT:
+			/* further digits keep the state and are not counted */
+			if( NEXT_SYMBOL == '.' ) {
+				state = CORE_FLOAT;
+				transition_count[CORE_INT]++;
+			}
+			else if(!ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INVALID;
+				transition_count[CORE_INT]++;
+			}
+			break;
+		case CORE_FLOAT:
+			/* further digits keep the state and are not counted */
+			if( NEXT_SYMBOL == 'E' || NEXT_SYMBOL == 'e' ) {
+				state = CORE_S2;
+				transition_count[CORE_FLOAT]++;
+			}
+			else if(!ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INVALID;
+				transition_count[CORE_FLOAT]++;
+			}
+			break;
+		case CORE_S2:
+			/* after the exponent marker a sign is mandatory */
+			if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
+				state = CORE_EXPONENT;
+				transition_count[CORE_S2]++;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_S2]++;
+			}
+			break;
+		case CORE_EXPONENT:
+			/* after the exponent sign a digit is mandatory */
+			if(ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_SCIENTIFIC;
+				transition_count[CORE_EXPONENT]++;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_EXPONENT]++;
+			}
+			break;
+		case CORE_SCIENTIFIC:
+			/* NOTE(review): this case counts against CORE_INVALID, unlike the other
+			   cases which count against the state being left. The counters are folded
+			   into the crc by core_bench_state, so changing this would change the
+			   crc values that the results are validated against. */
+			if(!ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INVALID;
+				transition_count[CORE_INVALID]++;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+	/* report how far the input was consumed */
+	*instr=str;
+	return state;
+}
--- a/coremark/src/core_util.c
+++ b/coremark/src/core_util.c
@ -0,0 +1,210 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+#include "coremark.h"
+/* Function: get_seed
+	Get a value that cannot be determined at compile time.
+
+	Since different embedded systems and compilers are used, 3 different methods are provided:
+	1 - Using a volatile variable. This method is only valid if the compiler is forced to generate code that
+	reads the value of a volatile variable from memory at run time.
+	Please note, if using this method, you would need to modify core_portme.c to generate training profile.
+	2 - Command line arguments. This is the preferred method if command line arguments are supported.
+	3 - System function. If none of the first 2 methods is available on the platform,
+	a system function which is not a stub can be used.
+
+	e.g. read the value on GPIO pins connected to switches, or invoke special simulator functions.
+*/
+#if (SEED_METHOD==SEED_VOLATILE)
+	extern volatile ee_s32 seed1_volatile;
+	extern volatile ee_s32 seed2_volatile;
+	extern volatile ee_s32 seed3_volatile;
+	extern volatile ee_s32 seed4_volatile;
+	extern volatile ee_s32 seed5_volatile;
+	/* Return the volatile seed number i (1..5); any other index yields 0. */
+	ee_s32 get_seed_32(int i) {
+		/* each case performs one read of the matching volatile variable */
+		switch (i) {
+			case 1:
+				return seed1_volatile;
+			case 2:
+				return seed2_volatile;
+			case 3:
+				return seed3_volatile;
+			case 4:
+				return seed4_volatile;
+			case 5:
+				return seed5_volatile;
+			default:
+				return 0;
+		}
+	}
+#elif (SEED_METHOD==SEED_ARG)
+/* Parse a number from valstring.
+	Accepted form: optional '-', optional "0x" prefix selecting hexadecimal
+	(digits 0-9 and lower case a-f), then the digits, then an optional 'K'
+	(times 1024) or 'M' (times 1024*1024) suffix. Parsing stops at the first
+	character that is not a digit of the selected base.
+*/
+ee_s32 parseval(char *valstring) {
+	char *cur=valstring;
+	ee_s32 value=0;
+	ee_s32 sign=1;
+	int is_hex;
+
+	if (*cur == '-') {
+		sign=-1;
+		cur++;
+	}
+	is_hex=((cur[0] == '0') && (cur[1] == 'x'));
+	if (is_hex)
+		cur+=2;
+
+	/* accumulate the digits */
+	if (is_hex) {
+		for (;;cur++) {
+			char c=*cur;
+			if ((c >= '0') && (c <= '9'))
+				value=value*16+(c-'0');
+			else if ((c >= 'a') && (c <= 'f'))
+				value=value*16+(10+c-'a');
+			else
+				break;
+		}
+	} else {
+		for (; (*cur >= '0') && (*cur <= '9'); cur++)
+			value=value*10+(*cur-'0');
+	}
+
+	/* now add qualifiers */
+	if (*cur=='K')
+		value*=1024;
+	if (*cur=='M')
+		value*=1024*1024;
+
+	return value*sign;
+}
+
+/* Return command line argument i parsed with parseval, or 0 when fewer than i+1 arguments were given. */
+ee_s32 get_seed_args(int i, int argc, char *argv[]) {
+	return (argc>i) ? parseval(argv[i]) : 0;
+}
+
+#elif (SEED_METHOD==SEED_FUNC)
+/* If using OS based function, you must define and implement the functions below in core_portme.h and core_portme.c ! */
+/* Return seed number i (1..5) from the matching portme_sys function; any other index yields 0. */
+ee_s32 get_seed_32(int i) {
+	switch (i) {
+		case 1:
+			return portme_sys1();
+		case 2:
+			return portme_sys2();
+		case 3:
+			return portme_sys3();
+		case 4:
+			return portme_sys4();
+		case 5:
+			return portme_sys5();
+		default:
+			return 0;
+	}
+}
+#endif
+
+/* Function: crc*
+	Service functions to calculate 16b CRC code.
+
+	crcu8 folds one byte into crc, least significant bit first. For every bit
+	the crc is shifted right by one; when the data bit differs from the low
+	crc bit, 0x4002 is XORed in before the shift and bit 15 is set after it
+	(together the same as XORing 0xa001 into the shifted value).
+*/
+ee_u16 crcu8(ee_u8 data, ee_u16 crc )
+{
+	ee_u8 bit;
+
+	for (bit = 0; bit < 8; bit++)
+	{
+		/* 1 when the current data bit and the low crc bit differ */
+		ee_u8 feedback = (ee_u8)((data ^ (ee_u8)crc) & 1);
+		data >>= 1;
+
+		if (feedback)
+			crc ^= 0x4002;
+		crc >>= 1;
+		crc = feedback ? (crc | 0x8000) : (crc & 0x7fff);
+	}
+	return crc;
+}
+/* Fold a 16 bit value into crc: low byte first, then high byte. */
+ee_u16 crcu16(ee_u16 newval, ee_u16 crc) {
+	ee_u8 low=(ee_u8)(newval);
+	ee_u8 high=(ee_u8)((newval)>>8);
+
+	return crcu8(high,crcu8(low,crc));
+}
+/* Fold a 32 bit value into crc: low 16 bits first, then the high 16 bits. */
+ee_u16 crcu32(ee_u32 newval, ee_u16 crc) {
+	ee_s16 low=(ee_s16) newval;
+	ee_s16 high=(ee_s16) (newval>>16);
+
+	return crc16(high,crc16(low,crc));
+}
+/* Signed front end of crcu16: the value is converted to ee_u16 and folded into crc. */
+ee_u16 crc16(ee_s16 newval, ee_u16 crc) {
+	ee_u16 bits=(ee_u16)newval;
+
+	return crcu16(bits,crc);
+}
+
+/* Check the sizes of the ee_* types.
+	Prints one message per type with an unexpected size, plus a closing hint
+	when at least one check failed.
+
+	Returns:
+	The number of failed checks.
+*/
+ee_u8 check_data_types() {
+	ee_u8 failures=0;
+
+	if (sizeof(ee_u8) != 1) {
+		ee_printf("ERROR: ee_u8 is not an 8b datatype!\n");
+		failures++;
+	}
+
+	/* both 16 bit types */
+	if (sizeof(ee_u16) != 2) {
+		ee_printf("ERROR: ee_u16 is not a 16b datatype!\n");
+		failures++;
+	}
+	if (sizeof(ee_s16) != 2) {
+		ee_printf("ERROR: ee_s16 is not a 16b datatype!\n");
+		failures++;
+	}
+
+	/* both 32 bit types */
+	if (sizeof(ee_s32) != 4) {
+		ee_printf("ERROR: ee_s32 is not a 32b datatype!\n");
+		failures++;
+	}
+	if (sizeof(ee_u32) != 4) {
+		ee_printf("ERROR: ee_u32 is not a 32b datatype!\n");
+		failures++;
+	}
+
+	/* the pointer sized integer */
+	if (sizeof(ee_ptr_int) != sizeof(int *)) {
+		ee_printf("ERROR: ee_ptr_int is not a datatype that holds an int pointer!\n");
+		failures++;
+	}
+
+	if (failures != 0) {
+		ee_printf("ERROR: Please modify the datatypes in core_portme.h!\n");
+	}
+	return failures;
+}
--- a/dhrystone/Makefile
+++ b/dhrystone/Makefile
@ -0,0 +1,3 @@
+NAME = dhrystone
+SRCS = dry.c
+include $(AM_HOME)/Makefile
--- a/dhrystone/dry.c
+++ b/dhrystone/dry.c
@ -0,0 +1,950 @@
+/****************** "DHRYSTONE" Benchmark Program ***************************/
+#define Version "C, Version 2.2"
+/*  File:       dhry_1.c (part 2 of 3)
+ *  Author:     Reinhold P. Weicker
+ *              Siemens Nixdorf, Paderborn/Germany
+ *              weicker@specbench.org
+ *  Date:       May 25, 1988
+ *  Modified:	Steven Pemberton, CWI, Amsterdam; Steven.Pemberton@cwi.nl
+ *  Date:       October, 1993; March 1995
+ *              Included both files into one source, that gets compiled
+ *              in two passes. Made program auto-compiling, and auto-running,
+ *              and generally made it much easier to use.
+ *
+ *              Original Version (in Ada) published in
+ *              "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
+ *              pp. 1013 - 1030, together with the statistics
+ *              on which the distribution of statements etc. is based.
+ *
+ *              In this C version, the following C library functions are used:
+ *              - strcpy, strcmp (inside the measurement loop)
+ *              - printf, scanf (outside the measurement loop)
+ *              In addition, Berkeley UNIX system calls "times ()" or "time ()"
+ *              are used for execution time measurement. For measurements
+ *              on other systems, these calls have to be changed.
+ *
+ *  Collection of Results:
+ *              Reinhold Weicker (address see above) and
+ *
+ *              Rick Richardson
+ *              PC Research. Inc.
+ *              94 Apple Orchard Drive
+ *              Tinton Falls, NJ 07724
+ *                      Phone:  (201) 389-8963 (9-17 EST)
+ *                      Usenet: ...!uunet!pcrat!rick
+ *
+ *      Please send results to Rick Richardson and/or Reinhold Weicker.
+ *      Complete information should be given on hardware and software used.
+ *      Hardware information includes: Machine type, CPU, type and size
+ *      of caches; for microprocessors: clock frequency, memory speed
+ *      (number of wait states).
+ *      Software information includes: Compiler (and runtime library)
+ *      manufacturer and version, compilation switches, OS version.
+ *      The Operating System version may give an indication about the compiler;
+ *      Dhrystone itself performs no OS calls in the measurement loop.
+ *
+ *      The complete output generated by the program should be mailed
+ *      such that at least some checks for correctness can be made.
+ *
+ ***************************************************************************
+ *
+ * Defines:     The following "Defines" are possible:
+ *      -DREG          (default: Not defined)
+ *              As an approximation to what an average C programmer
+ *              might do, causes the "register" storage class to be applied
+ *              - for local variables, if they are used (dynamically)
+ *                five or more times
+ *              - for parameters if they are used (dynamically)
+ *                six or more times
+ *              Note that an optimal "register" strategy is
+ *              compiler-dependent, and that "register" declarations
+ *              do not necessarily lead to faster execution.
+ *      -DNOSTRUCTASSIGN        (default: Not defined)
+ *              Define if the C compiler does not support
+ *              assignment of structures.
+ *      -DNOENUMS               (default: Not defined)
+ *              Define if the C compiler does not support
+ *              enumeration types.
+ *      -DTIMES                 (default)
+ *      -DTIME
+ *              The "times" function of UNIX (returning process times)
+ *              or the "time" function (returning wallclock time)
+ *              is used for measurement.
+ *              For single user machines, "time ()" is adequate. For
+ *              multi-user machines where you cannot get single-user
+ *              access, use the "times ()" function. If you have
+ *              neither, use a stopwatch in the dead of night.
+ *              "printf"s are provided marking the points "Start Timer"
+ *              and "Stop Timer". DO NOT use the UNIX "time(1)"
+ *              command, as this will measure the total time to
+ *              run this program, which will (erroneously) include
+ *              the time to allocate storage (malloc) and to perform
+ *              the initialization.
+ *      -DHZ=nnn
+ *              In Berkeley UNIX, the function "times" returns process
+ *              time in 1/HZ seconds, with HZ = 60 for most systems.
+ *              CHECK YOUR SYSTEM DESCRIPTION BEFORE YOU JUST APPLY
+ *              A VALUE.
+ *
+ ***************************************************************************
+ *
+ *  History:	Version C/2.1 was made for two reasons:
+ *
+ *	1) There was an obvious need for a common C version of
+ *      Dhrystone, since C is at present the most popular system
+ *      programming language for the class of processors
+ *      (microcomputers, minicomputers) where Dhrystone is used most.
+ *      There should be, as far as possible, only one C version of
+ *      Dhrystone such that results can be compared without
+ *      restrictions. In the past, the C versions distributed
+ *      by Rick Richardson (Version 1.1) and by Reinhold Weicker
+ *      had small (though not significant) differences.
+ *
+ *      2) As far as it is possible without changes to the Dhrystone
+ *      statistics, optimizing compilers should be prevented from
+ *      removing significant statements.
+ *
+ *      This C version has been developed in cooperation with
+ *      Rick Richardson (Tinton Falls, NJ), it incorporates many
+ *      ideas from the "Version 1.1" distributed previously by
+ *      him over the UNIX network Usenet.
+ *      I also thank Chaim Benedelac (National Semiconductor),
+ *      David Ditzel (SUN), Earl Killian and John Mashey (MIPS),
+ *      Alan Smith and Rafael Saavedra-Barrera (UC at Berkeley)
+ *      for their help with comments on earlier versions of the
+ *      benchmark.
+ *
+ *  Changes:    In the initialization part, this version follows mostly
+ *      Rick Richardson's version distributed via Usenet, not the
+ *      version distributed earlier via floppy disk by Reinhold Weicker.
+ *      As a concession to older compilers, names have been made
+ *      unique within the first 8 characters.
+ *      Inside the measurement loop, this version follows the
+ *      version previously distributed by Reinhold Weicker.
+ *
+ *      At several places in the benchmark, code has been added,
+ *      but within the measurement loop only in branches that
+ *      are not executed. The intention is that optimizing compilers
+ *      should be prevented from moving code out of the measurement
+ *      loop, or from removing code altogether. Since the statements
+ *      that are executed within the measurement loop have NOT been
+ *      changed, the numbers defining the "Dhrystone distribution"
+ *      (distribution of statements, operand types and locality)
+ *      still hold. Except for sophisticated optimizing compilers,
+ *      execution times for this version should be the same as
+ *      for previous versions.
+ *
+ *      Since it has proven difficult to subtract the time for the
+ *      measurement loop overhead in a correct way, the loop check
+ *      has been made a part of the benchmark. This does have
+ *      an impact - though a very minor one - on the distribution
+ *      statistics which have been updated for this version.
+ *
+ *      All changes within the measurement loop are described
+ *      and discussed in the companion paper "Rationale for
+ *      Dhrystone version 2".
+ *
+ *      Because of the self-imposed limitation that the order and
+ *      distribution of the executed statements should not be
+ *      changed, there are still cases where optimizing compilers
+ *      may not generate code for some statements. To a certain
+ *      degree, this is unavoidable for small synthetic benchmarks.
+ *      Users of the benchmark are advised to check code listings
+ *      whether code is generated for all statements of Dhrystone.
+ *
+ *      Version 2.1 is identical to version 2.0 distributed via
+ *      the UNIX network Usenet in March 1988 except that it corrects
+ *      some minor deficiencies that were found by users of version 2.0.
+ *      The only change within the measurement loop is that a
+ *      non-executed "else" part was added to the "if" statement in
+ *      Func_3, and a non-executed "else" part removed from Proc_3.
+ *
+ * Version C/2.2, Steven Pemberton, October 1993
+ *	Functionally identical to version 2.1; the changes are in
+ *	how you compile and use it:
+ *	- Everything is in one file now, but compiled in 2 passes
+ *	- Compile (and run) by running the file through the shell: "sh dhry.c"
+ *	- Uses the system definition of HZ if one can be found
+ *	- HZ must be defined, otherwise it won't compile (no defaults here)
+ *	- The (uninteresting) output is printed to stderr (dhry2 > /dev/null)
+ *	- The number of loops is passed as a parameter, rather than read
+ *	  (dhry2 500000)
+ *	- If the number of loops is insufficient to get a good result,
+ *	  it repeats it with loops*10 until it is enough (rather than just
+ *	  stopping)
+ *	- Output says which sort of clock it is using, and the HZ value
+ *	- You can use -DREG instead of the -DREG=register of previous versions
+ *	- Some stylistic cleanups.
+ *
+ ***************************************************************************
+ *
+ *  Compilation model and measurement (IMPORTANT):
+ *
+ *  The following "ground rules" apply for measurements:
+ *  - Separate compilation
+ *  - No procedure merging
+ *  - Otherwise, compiler optimizations are allowed but should be indicated
+ *  - Default results are those without register declarations
+ *  See the companion paper "Rationale for Dhrystone Version 2" for a more
+ *  detailed discussion of these ground rules.
+ *
+ *  For 16-Bit processors (e.g. 80186, 80286), times for all compilation
+ *  models ("small", "medium", "large" etc.) should be given if possible,
+ *  together with a definition of these models for the compiler system used.
+ *
+ **************************************************************************
+ *
+ *  Dhrystone (C version) statistics:
+ *
+ *  [Comment from the first distribution, updated for version 2.
+ *   Note that because of language differences, the numbers are slightly
+ *   different from the Ada version.]
+ *
+ *  The following program contains statements of a high level programming
+ *  language (here: C) in a distribution considered representative:
+ *
+ *    assignments                  52 (51.0 %)
+ *    control statements           33 (32.4 %)
+ *    procedure, function calls    17 (16.7 %)
+ *
+ *  103 statements are dynamically executed. The program is balanced with
+ *  respect to the three aspects:
+ *
+ *    - statement type
+ *    - operand type
+ *    - operand locality
+ *         operand global, local, parameter, or constant.
+ *
+ *  The combination of these three aspects is balanced only approximately.
+ *
+ *  1. Statement Type:
+ *  -----------------             number
+ *
+ *     V1 = V2                     9
+ *       (incl. V1 = F(..))
+ *     V = Constant               12
+ *     Assignment,                 7
+ *       with array element
+ *     Assignment,                 6
+ *       with record component
+ *                                --
+ *                                34       34
+ *
+ *     X = Y +|-|"&&"|"|" Z        5
+ *     X = Y +|-|"==" Constant     6
+ *     X = X +|- 1                 3
+ *     X = Y *|/ Z                 2
+ *     X = Expression,             1
+ *           two operators
+ *     X = Expression,             1
+ *           three operators
+ *                                --
+ *                                18       18
+ *
+ *     if ....                    14
+ *       with "else"      7
+ *       without "else"   7
+ *           executed        3
+ *           not executed    4
+ *     for ...                     7  |  counted every time
+ *     while ...                   4  |  the loop condition
+ *     do ... while                1  |  is evaluated
+ *     switch ...                  1
+ *     break                       1
+ *     declaration with            1
+ *       initialization
+ *                                --
+ *                                34       34
+ *
+ *     P (...)  procedure call    11
+ *       user procedure      10
+ *       library procedure    1
+ *     X = F (...)
+ *             function  call      6
+ *       user function        5
+ *       library function     1
+ *                                --
+ *                                17       17
+ *                                        ---
+ *                                        103
+ *
+ *    The average number of parameters in procedure or function calls
+ *    is 1.82 (not counting the function values as implicit parameters).
+ *
+ *  2. Operators
+ *  ------------
+ *                          number    approximate
+ *                                    percentage
+ *
+ *    Arithmetic             32          50.8
+ *
+ *       +                     21          33.3
+ *       -                      7          11.1
+ *       *                      3           4.8
+ *       / (int div)            1           1.6
+ *
+ *    Comparison             27           42.8
+ *
+ *       ==                     9           14.3
+ *       /=                     4            6.3
+ *       >                      1            1.6
+ *       <                      3            4.8
+ *       >=                     1            1.6
+ *       <=                     9           14.3
+ *
+ *    Logic                   4            6.3
+ *
+ *       && (AND-THEN)          1            1.6
+ *       |  (OR)                1            1.6
+ *       !  (NOT)               2            3.2
+ *
+ *                           --          -----
+ *                           63          100.1
+ *
+ *
+ *  3. Operand Type (counted once per operand reference):
+ *  ---------------
+ *                          number    approximate
+ *                                    percentage
+ *
+ *     Integer               175        72.3 %
+ *     Character              45        18.6 %
+ *     Pointer                12         5.0 %
+ *     String30                6         2.5 %
+ *     Array                   2         0.8 %
+ *     Record                  2         0.8 %
+ *                           ---       -------
+ *                           242       100.0 %
+ *
+ *  When there is an access path leading to the final operand (e.g. a record
+ *  component), only the final data type on the access path is counted.
+ *
+ *
+ *  4. Operand Locality:
+ *  -------------------
+ *                                number    approximate
+ *                                          percentage
+ *
+ *     local variable              114        47.1 %
+ *     global variable              22         9.1 %
+ *     parameter                    45        18.6 %
+ *        value                        23         9.5 %
+ *        reference                    22         9.1 %
+ *     function result               6         2.5 %
+ *     constant                     55        22.7 %
+ *                                 ---       -------
+ *                                 242       100.0 %
+ *
+ *  The program does not compute anything meaningful, but it is syntactically
+ *  and semantically correct. All variables have a value assigned to them
+ *  before they are used as a source operand.
+ *
+ *  There has been no explicit effort to account for the effects of a
+ *  cache, or to balance the use of long or short displacements for code or
+ *  data.
+ *
+ ***************************************************************************
+ */
+
+/* Compiler and system dependent definitions: */
+
+/* variables for time measurement: */
+
+#include <am.h>
+#include <klib.h>
+#include <klib-macros.h>
+
+/* uptime_ms: read the AM uptime timer and scale its `us` field down by 1000
+ * (integer division), returning the result as a 32-bit value. */
+static uint32_t uptime_ms() {
+  uint32_t elapsed = io_read(AM_TIMER_UPTIME).us / 1000;
+  return elapsed;
+}
+/* Latch the current uptime into Begin_Time / End_Time around the measured loop. */
+#define Start_Timer() Begin_Time = uptime_ms()
+#define Stop_Timer()  End_Time   = uptime_ms()
+
+#define NUMBER_OF_RUNS		500000 /* Default number of runs */
+#define PASS2
+
+#ifdef  NOSTRUCTASSIGN
+#define structassign(d, s)      memcpy(&(d), &(s), sizeof(d))
+#else
+#define structassign(d, s)      d = s
+#endif
+
+/* Enumeration: the five-valued type (Ident_1 .. Ident_5 == 0 .. 4) used
+ * throughout the benchmark.
+ * The header comment of this file documents the opt-out switch as -DNOENUMS,
+ * whereas this guard historically tested NOENUM only. Both spellings are
+ * accepted so that following the documentation actually selects the
+ * plain-int fallback for compilers without enumeration support. */
+#if defined(NOENUM) || defined(NOENUMS)
+#define Ident_1 0
+#define Ident_2 1
+#define Ident_3 2
+#define Ident_4 3
+#define Ident_5 4
+  typedef int   Enumeration;
+#else
+  typedef       enum    {Ident_1, Ident_2, Ident_3, Ident_4, Ident_5}
+                Enumeration;
+#endif
+        /* for boolean and enumeration types in Ada, Pascal */
+
+/* General definitions: */
+
+
+#define Null 0
+                /* Value of a Null pointer */
+
+typedef int     One_Thirty;
+typedef int     One_Fifty;
+typedef char    Capital_Letter;
+typedef int     Boolean;
+typedef char    Str_30 [31];
+typedef int     Arr_1_Dim [50];
+typedef int     Arr_2_Dim [50] [50];
+
+/* Rec_Type / Rec_Pointer: the variant record the benchmark passes around.
+ * A record links to another record through Ptr_Comp, carries a discriminant
+ * (Discr), and overlays three alternative payload layouts in `variant`.
+ * The code in this file only reads and writes the var_1 layout. */
+typedef struct record
+    {
+    struct record *Ptr_Comp;   /* link to another record */
+    Enumeration    Discr;      /* discriminant; tested against Ident_1 in Proc_1 */
+    union {
+          /* layout 1: enumeration, integer and a 30-character string (+ NUL) */
+          struct {
+                  Enumeration Enum_Comp;
+                  int         Int_Comp;
+                  char        Str_Comp [31];
+                  } var_1;
+          /* layout 2: enumeration and a 30-character string (+ NUL) */
+          struct {
+                  Enumeration E_Comp_2;
+                  char        Str_2_Comp [31];
+                  } var_2;
+          /* layout 3: two characters */
+          struct {
+                  char        Ch_1_Comp;
+                  char        Ch_2_Comp;
+                  } var_3;
+          } variant;
+      } Rec_Type, *Rec_Pointer;
+
+/* Global Variables: */
+
+Rec_Pointer     Ptr_Glob,
+                Next_Ptr_Glob;
+int             Int_Glob;
+Boolean         Bool_Glob;
+char            Ch_1_Glob,
+                Ch_2_Glob;
+int             Arr_1_Glob [50];
+int             Arr_2_Glob [50] [50];
+
+Enumeration     Func_1 ();
+  /* forward declaration necessary since Enumeration may not simply be int */
+
+#ifndef REG
+        Boolean Reg = false;
+#define REG
+        /* REG becomes defined as empty */
+        /* i.e. no register variables   */
+#else
+        Boolean Reg = true;
+#undef REG
+#define REG register
+#endif
+
+Boolean		Done;
+
+long            Begin_Time,
+                End_Time,
+                User_Time;
+float           Microseconds,
+                Dhrystones_Per_Second;
+
+/* end of variables for time measurement */
+
+/* Static pool backing myalloc(); storage handed out is never released. */
+static char memory[1024];
+static char *free_mem = &memory[0];
+
+/* myalloc: bump allocator standing in for malloc().
+ *   size    - number of bytes requested
+ *   returns - start of the reservation, aligned to sizeof(void *), or NULL
+ *             when the pool cannot satisfy the request (the pool is left
+ *             untouched in that case).
+ * Alignment is to pointer size rather than a fixed 4 bytes because the
+ * records placed here contain a pointer member, which needs stricter
+ * alignment than 4 on 64-bit targets. */
+static char* myalloc(size_t size) {
+  char *pool_end = memory + sizeof memory;
+  char *start = free_mem;
+
+  /* Advance to the next pointer-aligned address without leaving the pool. */
+  while (start < pool_end && (uintptr_t)start % sizeof(void *) != 0) start ++;
+
+  /* Refuse requests that would run past the end of the pool. */
+  if (size > (size_t)(pool_end - start)) return NULL;
+
+  free_mem = start + size;
+  return start;
+}
+
+void Proc_6 (Enumeration, Enumeration*);
+void Proc_3 (Rec_Pointer*);
+void Proc_7 (One_Fifty a, One_Fifty b, One_Fifty* c);
+Boolean Func_2 (Str_30, Str_30);
+void Proc_8(Arr_1_Dim, Arr_2_Dim, int, int);
+Boolean Func_3 (Enumeration);
+
+/* Proc_1: record-manipulation step of the measured loop.
+ * Copies *Ptr_Glob into the record that Ptr_Val_Par links to, then updates
+ * integer, enumeration and link fields of both records via Proc_3, Proc_6
+ * and Proc_7. main() calls it with Ptr_Glob, whose Ptr_Comp is Next_Ptr_Glob.
+ * Statement order is part of the benchmark definition; do not reorder. */
+void Proc_1 (Ptr_Val_Par)
+/******************/
+
+REG Rec_Pointer Ptr_Val_Par;
+    /* executed once */
+{
+  REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;
+                                        /* == Next_Ptr_Glob */
+  /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp,    */
+  /* corresponds to "rename" in Ada, "with" in Pascal           */
+
+  /* Whole-record copy; this also copies Ptr_Glob's Discr and string. */
+  structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
+  Ptr_Val_Par->variant.var_1.Int_Comp = 5;
+  Next_Record->variant.var_1.Int_Comp
+        = Ptr_Val_Par->variant.var_1.Int_Comp;
+  Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
+  /* Proc_3 overwrites the link just stored and updates Ptr_Glob's Int_Comp. */
+  Proc_3 (&Next_Record->Ptr_Comp);
+    /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp
+                        == Ptr_Glob->Ptr_Comp */
+  if (Next_Record->Discr == Ident_1)
+    /* then, executed */
+  {
+    Next_Record->variant.var_1.Int_Comp = 6;
+    Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp,
+           &Next_Record->variant.var_1.Enum_Comp);
+    Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
+    /* Int_Comp becomes 6 + 2 + 10 == 18 (see Proc_7). */
+    Proc_7 (Next_Record->variant.var_1.Int_Comp, 10,
+           &Next_Record->variant.var_1.Int_Comp);
+  }
+  else /* not executed */
+    structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
+} /* Proc_1 */
+
+
+/* Proc_2: updates *Int_Par_Ref to (*Int_Par_Ref + 10 - 1) - Int_Glob when
+ * Ch_1_Glob is 'A'. In the measured loop the argument is 1 and Int_Glob is 5,
+ * so the result is 5 (this is what main() verifies for Int_1_Loc). */
+void Proc_2 (Int_Par_Ref)
+/******************/
+    /* executed once */
+    /* *Int_Par_Ref == 1, becomes 5 */
+
+One_Fifty   *Int_Par_Ref;
+{
+  One_Fifty  Int_Loc;
+  Enumeration   Enum_Loc;
+
+  Int_Loc = *Int_Par_Ref + 10;
+  /* NOTE(review): Enum_Loc is only assigned inside the `if`; should      */
+  /* Ch_1_Glob ever differ from 'A' here, the loop condition would read   */
+  /* an uninitialized value. The benchmark always sets it to 'A' first.   */
+  do /* executed once */
+    if (Ch_1_Glob == 'A')
+      /* then, executed */
+    {
+      Int_Loc -= 1;
+      *Int_Par_Ref = Int_Loc - Int_Glob;
+      Enum_Loc = Ident_1;
+    } /* if */
+  while (Enum_Loc != Ident_1); /* false after the first pass: loop exits */
+} /* Proc_2 */
+
+
+/* Proc_3: stores Ptr_Glob->Ptr_Comp through Ptr_Ref_Par (when Ptr_Glob is
+ * non-null) and then sets Ptr_Glob's Int_Comp to 10 + 2 + Int_Glob via
+ * Proc_7. Note that the Proc_7 call dereferences Ptr_Glob unconditionally. */
+void Proc_3 (Ptr_Ref_Par)
+/******************/
+    /* executed once */
+    /* *Ptr_Ref_Par becomes Ptr_Glob->Ptr_Comp */
+
+Rec_Pointer *Ptr_Ref_Par;
+
+{
+  if (Ptr_Glob != Null)
+    /* then, executed */
+    *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
+  Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);
+} /* Proc_3 */
+
+
+/* Proc_4: ORs (Ch_1_Glob == 'A') into Bool_Glob and sets Ch_2_Glob to 'B'. */
+void Proc_4 () /* without parameters */
+/*******/
+    /* executed once */
+{
+  Boolean Bool_Loc;
+
+  Bool_Loc = Ch_1_Glob == 'A';
+  /* bitwise OR on purpose: counted as the "| (OR)" operator in the statistics */
+  Bool_Glob = Bool_Loc | Bool_Glob;
+  Ch_2_Glob = 'B';
+} /* Proc_4 */
+
+
+/* Proc_5: resets the globals at the start of every run:
+ * Ch_1_Glob = 'A', Bool_Glob = false. */
+void Proc_5 () /* without parameters */
+/*******/
+    /* executed once */
+{
+  Ch_1_Glob = 'A';
+  Bool_Glob = false;
+} /* Proc_5 */
+
+
+        /* Procedure for the assignment of structures,          */
+        /* if the C compiler doesn't support this feature       */
+#ifdef  NOSTRUCTASSIGN
+/* Byte-by-byte copy of l bytes from s to d, used by structassign() when the  */
+/* compiler cannot assign structures. Only compiled with -DNOSTRUCTASSIGN.    */
+/* NOTE(review): the definition has no declared return type and reuses the    */
+/* name memcpy, which presumably clashes with the declaration pulled in via   */
+/* klib.h - confirm before ever enabling NOSTRUCTASSIGN in this port.         */
+memcpy (d, s, l)
+register char   *d;
+register char   *s;
+register int    l;
+{
+        while (l--) *d++ = *s++;
+}
+#endif
+
+
+/* REG has already been settled earlier in this file: empty by default,
+ * "register" when the build passes -DREG. This second block is a leftover
+ * from the original two-pass layout; toggling REG again here would always
+ * take the "already defined" path and silently turn every later REG
+ * declaration into "register", contradicting the default of no register
+ * variables. Only supply the empty default if REG is still undefined. */
+#ifndef REG
+#define REG
+        /* REG becomes defined as empty */
+        /* i.e. no register variables   */
+#endif
+
+extern  int     Int_Glob;
+extern  char    Ch_1_Glob;
+
+
+/* Proc_6: maps Enum_Val_Par to a result stored through Enum_Ref_Par.
+ * The result is first set to the input (or Ident_4 when Func_3 rejects it)
+ * and then overwritten by the switch for every case except Ident_4:
+ *   Ident_1 -> Ident_1, Ident_2 -> Ident_1 if Int_Glob > 100 else Ident_4,
+ *   Ident_3 -> Ident_2, Ident_5 -> Ident_3. */
+void Proc_6 (Enum_Val_Par, Enum_Ref_Par)
+/*********************************/
+    /* executed once */
+    /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */
+
+Enumeration  Enum_Val_Par;
+Enumeration *Enum_Ref_Par;
+{
+  *Enum_Ref_Par = Enum_Val_Par;
+  if (! Func_3 (Enum_Val_Par))
+    /* then, not executed */
+    *Enum_Ref_Par = Ident_4;
+  switch (Enum_Val_Par)
+  {
+    case Ident_1:
+      *Enum_Ref_Par = Ident_1;
+      break;
+    case Ident_2:
+      if (Int_Glob > 100)
+        /* then */
+      *Enum_Ref_Par = Ident_1;
+      else *Enum_Ref_Par = Ident_4;
+      break;
+    case Ident_3: /* executed */
+      *Enum_Ref_Par = Ident_2;
+      break;
+    case Ident_4: break; /* keeps the value assigned before the switch */
+    case Ident_5:
+      *Enum_Ref_Par = Ident_3;
+      break;
+  } /* switch */
+} /* Proc_6 */
+
+
+/* Proc_7: stores Int_1_Par_Val + 2 + Int_2_Par_Val through Int_Par_Ref. */
+void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val, One_Fifty *Int_Par_Ref)
+{
+  One_Fifty Int_Loc;
+
+  Int_Loc = Int_1_Par_Val + 2;
+  *Int_Par_Ref = Int_2_Par_Val + Int_Loc;
+} /* Proc_7 */
+
+
+/* Proc_8: array-access step of the measured loop.
+ * With Int_Loc = Int_1_Par_Val + 5 it writes elements Int_Loc, Int_Loc+1 and
+ * Int_Loc+30 of the one-dimensional array, fills two elements of row Int_Loc
+ * of the two-dimensional array, increments element [Int_Loc][Int_Loc-1]
+ * (the per-run counter main() checks), writes [Int_Loc+20][Int_Loc], and
+ * sets Int_Glob to 5. No bounds checks: Int_Loc+30 must stay below 50. */
+void Proc_8 (Arr_1_Par_Ref, Arr_2_Par_Ref, Int_1_Par_Val, Int_2_Par_Val)
+/*********************************************************************/
+    /* executed once      */
+    /* Int_Par_Val_1 == 3 */
+    /* Int_Par_Val_2 == 7 */
+Arr_1_Dim       Arr_1_Par_Ref;
+Arr_2_Dim       Arr_2_Par_Ref;
+int             Int_1_Par_Val;
+int             Int_2_Par_Val;
+{
+  REG One_Fifty Int_Index;
+  REG One_Fifty Int_Loc;
+
+  Int_Loc = Int_1_Par_Val + 5;
+  Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val;
+  Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc];
+  Arr_1_Par_Ref [Int_Loc+30] = Int_Loc;
+  for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index)
+    Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc;
+  /* accumulates across runs: main() expects Number_Of_Runs + 10 in [8][7] */
+  Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1;
+  Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc];
+  Int_Glob = 5;
+} /* Proc_8 */
+
+
+/* Func_1: returns Ident_1 when the two characters differ; otherwise stores
+ * the character in Ch_1_Glob and returns Ident_2. */
+Enumeration Func_1 (Ch_1_Par_Val, Ch_2_Par_Val)
+/*************************************************/
+    /* executed three times                                         */
+    /* first call:      Ch_1_Par_Val == 'R', Ch_2_Par_Val == 'Y'    */
+    /*                  (zero-based elements 2 and 3, see Func_2)   */
+    /* second call:     Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C'    */
+    /* third call:      Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C'    */
+
+Capital_Letter   Ch_1_Par_Val;
+Capital_Letter   Ch_2_Par_Val;
+{
+  Capital_Letter        Ch_1_Loc;
+  Capital_Letter        Ch_2_Loc;
+
+  Ch_1_Loc = Ch_1_Par_Val;
+  Ch_2_Loc = Ch_1_Loc;
+  if (Ch_2_Loc != Ch_2_Par_Val)
+    /* then, executed */
+    return (Ident_1);
+  else  /* not executed */
+  {
+    Ch_1_Glob = Ch_1_Loc;
+    return (Ident_2);
+   }
+} /* Func_1 */
+
+
+/* Func_2: string-comparison step of the measured loop.
+ * Compares one character pair through Func_1, then returns true when the
+ * local character is 'R' or when Str_1_Par_Ref sorts after Str_2_Par_Ref
+ * (strcmp > 0; in that case Int_Glob is also updated), and false otherwise.
+ * With the benchmark's strings the result is false. */
+Boolean Func_2 (Str_1_Par_Ref, Str_2_Par_Ref)
+/*************************************************/
+    /* executed once */
+    /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
+    /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */
+
+Str_30  Str_1_Par_Ref;
+Str_30  Str_2_Par_Ref;
+{
+  REG One_Thirty        Int_Loc;
+      Capital_Letter    Ch_Loc;
+
+  Int_Loc = 2;
+  /* NOTE(review): Ch_Loc is assigned only inside the `if` below, and the  */
+  /* loop cannot terminate unless that branch runs; it always does for the */
+  /* benchmark's strings.                                                  */
+  while (Int_Loc <= 2) /* loop body executed once */
+    if (Func_1 (Str_1_Par_Ref[Int_Loc],
+                Str_2_Par_Ref[Int_Loc+1]) == Ident_1)
+      /* then, executed */
+    {
+      Ch_Loc = 'A';
+      Int_Loc += 1;
+    } /* if, while */
+  if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
+    /* then, not executed */
+    Int_Loc = 7;
+  if (Ch_Loc == 'R') {
+    /* then, not executed */
+    return (true);
+  }
+  else /* executed */
+  {
+    if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0)
+    {
+      /* not executed */
+      Int_Loc += 7;
+      Int_Glob = Int_Loc;
+      return (true);
+    }
+    else /* executed */
+      return (false);
+  } /* if Ch_Loc */
+} /* Func_2 */
+
+
+/* Func_3: returns true exactly when Enum_Par_Val is Ident_3. */
+Boolean Func_3 (Enum_Par_Val)
+/***************************/
+    /* executed once        */
+    /* Enum_Par_Val == Ident_3 */
+Enumeration Enum_Par_Val;
+{
+  Enumeration Enum_Loc;
+
+  Enum_Loc = Enum_Par_Val;
+  if (Enum_Loc == Ident_3)
+    /* then, executed */
+    return (true);
+  else /* not executed */
+    return (false);
+} /* Func_3 */
+
+
+/* Overall verdict of the result checks; cleared by the first failing check. */
+Boolean pass = true;
+
+/* check: record a verification result. Returns cond unchanged so the call
+ * can be used directly inside an `if`; a zero cond latches pass to false. */
+Boolean check(int cond) {
+  if (cond) {
+    return cond;
+  }
+  pass = false;
+  return cond;
+}
+/* main: benchmark driver.
+ *  1. Initializes the I/O extensions, the two global records and the strings.
+ *  2. Runs the Dhrystone loop NUMBER_OF_RUNS times between Start_Timer() and
+ *     Stop_Timer().
+ *  3. Verifies every global and local against its expected final value,
+ *     printing the offending value for each mismatch.
+ *  4. Prints the elapsed time and a score relative to the reference machine.
+ * Always returns 0; a failed verification is reported through the
+ * PASS/FAIL text only. */
+int main ()
+/*****/
+
+  /* main program, corresponds to procedures        */
+  /* Main and Proc_0 in the Ada version             */
+{
+        One_Fifty       Int_1_Loc;
+  REG   One_Fifty       Int_2_Loc;
+        One_Fifty       Int_3_Loc;
+  REG   char            Ch_Index;
+        Enumeration     Enum_Loc;
+        Str_30          Str_1_Loc;
+        Str_30          Str_2_Loc;
+  REG   int             Run_Index;
+  REG   int             Number_Of_Runs;
+        int             Elapsed_Ms;
+        int             Marks;
+
+  ioe_init();
+
+  Number_Of_Runs = NUMBER_OF_RUNS;
+
+  /* Initializations */
+
+  Next_Ptr_Glob = (Rec_Pointer) myalloc (sizeof (Rec_Type));
+  Ptr_Glob = (Rec_Pointer) myalloc (sizeof (Rec_Type));
+
+  Ptr_Glob->Ptr_Comp                    = Next_Ptr_Glob;
+  Ptr_Glob->Discr                       = Ident_1;
+  Ptr_Glob->variant.var_1.Enum_Comp     = Ident_3;
+  Ptr_Glob->variant.var_1.Int_Comp      = 40;
+  strcpy (Ptr_Glob->variant.var_1.Str_Comp,
+          "DHRYSTONE PROGRAM, SOME STRING");
+  strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
+
+  Arr_2_Glob [8][7] = 10;
+        /* Was missing in published program. Without this statement,    */
+        /* Arr_2_Glob [8][7] would have an undefined value.             */
+        /* Warning: With 16-Bit processors and Number_Of_Runs > 32000,  */
+        /* overflow may occur for this array element.                   */
+
+  printf ("Dhrystone Benchmark, Version %s\n", Version);
+
+  Done = false;
+  while (!Done) {
+
+    printf ("Trying %d runs through Dhrystone.\n", Number_Of_Runs);
+
+    /***************/
+    /* Start timer */
+    /***************/
+
+    Start_Timer();
+
+    for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index)
+    {
+
+      Proc_5();
+      Proc_4();
+        /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
+      Int_1_Loc = 2;
+      Int_2_Loc = 3;
+      strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
+      Enum_Loc = Ident_2;
+      Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc);
+        /* Bool_Glob == 1 */
+      while (Int_1_Loc < Int_2_Loc)  /* loop body executed once */
+      {
+        Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
+          /* Int_3_Loc == 7 */
+        Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc);
+          /* Int_3_Loc == 7 */
+        Int_1_Loc += 1;
+      } /* while */
+        /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+      Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
+        /* Int_Glob == 5 */
+      Proc_1 (Ptr_Glob);
+      for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
+                               /* loop body executed twice */
+      {
+        if (Enum_Loc == Func_1 (Ch_Index, 'C'))
+            /* then, not executed */
+          {
+          Proc_6 (Ident_1, &Enum_Loc);
+          strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
+          Int_2_Loc = Run_Index;
+          Int_Glob = Run_Index;
+          }
+      }
+        /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+      Int_2_Loc = Int_2_Loc * Int_1_Loc;
+      Int_1_Loc = Int_2_Loc / Int_3_Loc;
+      Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
+        /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
+      Proc_2 (&Int_1_Loc);
+        /* Int_1_Loc == 5 */
+
+    } /* loop "for Run_Index" */
+
+    /**************/
+    /* Stop timer */
+    /**************/
+
+    Stop_Timer();
+
+    User_Time = End_Time - Begin_Time;
+
+    /* Unlike the original, this port never retries with more runs. */
+    Done = true;
+  }
+
+  /* Verification: each mismatch clears `pass` and prints the actual value. */
+
+  if (!check(Int_Glob == 5)) {
+    printf("Int_Glob:            %d\n", Int_Glob);
+    printf("        should be:   %d\n", 5);
+  }
+  if (!check(Bool_Glob == 1)) {
+    printf("Bool_Glob:           %d\n", Bool_Glob);
+    printf("        should be:   %d\n", 1);
+  }
+  if (!check(Ch_1_Glob == 'A')) {
+    printf("Ch_1_Glob:           %c\n", Ch_1_Glob);
+    printf("        should be:   %c\n", 'A');
+  }
+  if (!check(Ch_2_Glob == 'B')) {
+    printf("Ch_2_Glob:           %c\n", Ch_2_Glob);
+    printf("        should be:   %c\n", 'B');
+  }
+  if (!check(Arr_1_Glob[8] == 7)) {
+    printf("Arr_1_Glob[8]:       %d\n", Arr_1_Glob[8]);
+    printf("        should be:   %d\n", 7);
+  }
+  if (!check(Arr_2_Glob[8][7] == Number_Of_Runs + 10)) {
+    printf("Arr_2_Glob[8][7]:    %d\n", Arr_2_Glob[8][7]);
+    printf("        should be:   Number_Of_Runs + 10\n");
+  }
+
+  if (!check((int)Ptr_Glob->Discr == 0)) {
+    printf("Ptr_Glob->Discr:             %d\n", Ptr_Glob->Discr);
+    printf("        should be:   %d\n", 0);
+  }
+  if (!check(Ptr_Glob->variant.var_1.Enum_Comp == 2)) {
+    printf("Ptr_Glob->Enum_Comp:         %d\n", Ptr_Glob->variant.var_1.Enum_Comp);
+    printf("        should be:   %d\n", 2);
+  }
+  if (!check(Ptr_Glob->variant.var_1.Int_Comp == 17)) {
+    printf("Ptr_Glob->Int_Comp:          %d\n", Ptr_Glob->variant.var_1.Int_Comp);
+    printf("        should be:   %d\n", 17);
+  }
+  if (!check(strcmp(Ptr_Glob->variant.var_1.Str_Comp, "DHRYSTONE PROGRAM, SOME STRING") == 0)) {
+    printf("Ptr_Glob->Str_Comp:          %s\n", Ptr_Glob->variant.var_1.Str_Comp);
+    printf("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");
+  }
+
+  if (!check((int)Next_Ptr_Glob->Discr == 0)) {
+    printf("Next_Ptr_Glob->Discr:             %d\n", Next_Ptr_Glob->Discr);
+    printf("        should be:   %d\n", 0);
+  }
+  if (!check(Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)) {
+    printf("Next_Ptr_Glob->Enum_Comp:         %d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
+    printf("        should be:   %d\n", 1);
+  }
+  if (!check(Next_Ptr_Glob->variant.var_1.Int_Comp == 18)) {
+    printf("Next_Ptr_Glob->Int_Comp:          %d\n", Next_Ptr_Glob->variant.var_1.Int_Comp);
+    printf("        should be:   %d\n", 18);
+  }
+  if (!check(strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp, "DHRYSTONE PROGRAM, SOME STRING") == 0)) {
+    printf("Next_Ptr_Glob->Str_Comp:          %s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
+    printf("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");
+  }
+
+  if (!check(Int_1_Loc == 5)) {
+    printf("Int_1_Loc:           %d\n", Int_1_Loc);
+    printf("        should be:   %d\n", 5);
+  }
+  if (!check(Int_2_Loc == 13)) {
+    printf("Int_2_Loc:           %d\n", Int_2_Loc);
+    printf("        should be:   %d\n", 13);
+  }
+  if (!check(Int_3_Loc == 7)) {
+    printf("Int_3_Loc:           %d\n", Int_3_Loc);
+    printf("        should be:   %d\n", 7);
+  }
+  if (!check(Enum_Loc == 1)) {
+    printf("Enum_Loc:            %d\n", Enum_Loc);
+    printf("        should be:   %d\n", 1);
+  }
+
+  if (!check(strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)) {
+    printf("Str_1_Loc:           %s\n", Str_1_Loc);
+    printf("        should be:   DHRYSTONE PROGRAM, 1'ST STRING\n");
+  }
+  if (!check(strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)) {
+    printf("Str_2_Loc:           %s\n", Str_2_Loc);
+    printf("        should be:   DHRYSTONE PROGRAM, 2'ND STRING\n");
+  }
+
+  /* Score computation.                                                   */
+  /* - A run shorter than the timer resolution reports 0 ms; dividing by  */
+  /*   it is undefined, so clamp to 1 ms (the score is then a lower       */
+  /*   bound).                                                            */
+  /* - (880900 / ms) * NUMBER_OF_RUNS exceeds the range of int for runs   */
+  /*   shorter than about 205 ms, so the product is formed in long long.  */
+  /*   Evaluation order is unchanged, so results match the previous       */
+  /*   formula wherever that one was well defined.                        */
+  Elapsed_Ms = (int)User_Time > 0 ? (int)User_Time : 1;
+  Marks = (int)((long long)(880900 / Elapsed_Ms) * NUMBER_OF_RUNS / 500000);
+
+  printf ("Finished in %d ms\n", (int)User_Time);
+  printf("==================================================\n");
+  printf("Dhrystone %s         %d Marks\n", pass ? "PASS" : "FAIL",
+      Marks);
+  printf("                   vs. 100000 Marks (i7-7700K @ 4.20GHz)\n");
+
+  return 0;
+}
+
+
--- a/microbench/Makefile
+++ b/microbench/Makefile
@ -0,0 +1,3 @@
+NAME = microbench
+SRCS = $(shell find -L ./src/ -name "*.c" -o -name "*.cc")
+include $(AM_HOME)/Makefile
--- a/microbench/include/benchmark.h
+++ b/microbench/include/benchmark.h
@ -0,0 +1,113 @@
+#ifndef __BENCHMARK_H__
+#define __BENCHMARK_H__
+
+#include <am.h>
+#include <klib.h>
+#include <klib-macros.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Postfix size multipliers: written after a number ("2 MB", "128 KB") so that
+// the token sequence expands to a multiplication ("2 * 1024 * 1024").
+// The expansions are deliberately unparenthesized, so use them only in the
+// simple "<number> KB" / "<number> MB" form.
+#define MB * 1024 * 1024
+#define KB * 1024
+
+#define REF_CPU    "i7-7700K @ 4.20GHz"
+#define REF_SCORE  100000
+
+#define REPEAT  1
+
+//                  size |  heap | time |  checksum
+#define QSORT_S {     100,   1 KB,     0, 0x08467105}
+#define QSORT_M {   30000, 128 KB,     0, 0xa3e99fe4}
+#define QSORT_L {  100000, 640 KB,  5114, 0xed8cff89}
+#define QUEEN_S {       8,   0 KB,     0, 0x0000005c}
+#define QUEEN_M {      11,   0 KB,     0, 0x00000a78}
+#define QUEEN_L {      12,   0 KB,  4707, 0x00003778}
+#define    BF_S {       4,  32 KB,     0, 0xa6f0079e}
+#define    BF_M {      25,  32 KB,     0, 0xa88f8a65}
+#define    BF_L {     180,  32 KB, 23673, 0x9221e2b3}
+#define   FIB_S {       2,   1 KB,     0, 0x7cfeddf0}
+#define   FIB_M {      23,  16 KB,     0, 0x94ad8800}
+#define   FIB_L {      91, 256 KB, 28318, 0xebdc5f80}
+#define SIEVE_S {     100,   1 KB,     0, 0x00000019}
+#define SIEVE_M {  200000,  32 KB,     0, 0x00004640}
+#define SIEVE_L {10000000,   2 MB, 39361, 0x000a2403}
+#define  PZ15_S {       0,   1 KB,     0, 0x00000006}
+#define  PZ15_M {       1, 256 KB,     0, 0x0000b0df}
+#define  PZ15_L {       2,   2 MB,  4486, 0x00068b8c}
+#define DINIC_S {      10,   8 KB,     0, 0x0000019c}
+#define DINIC_M {      80, 512 KB,     0, 0x00004f99}
+#define DINIC_L {     128,   1 MB, 10882, 0x0000c248}
+#define  LZIP_S {     128, 128 KB,     0, 0xe05fc832}
+#define  LZIP_M {   50000,   1 MB,     0, 0xdc93e90c}
+#define  LZIP_L { 1048576,   4 MB,  7593, 0x8d62c81f}
+#define SSORT_S {     100,   4 KB,     0, 0x4c555e09}
+#define SSORT_M {   10000, 512 KB,     0, 0x0db7909b}
+#define SSORT_L {  100000,   4 MB,  4504, 0x4f0ab431}
+#define   MD5_S {     100,   1 KB,     0, 0xf902f28f}
+#define   MD5_M {  200000, 256 KB,     0, 0xd4f9bc6d}
+#define   MD5_L {10000000,  16 MB, 17239, 0x27286a42}
+
+#define BENCHMARK_LIST(def) \
+  def(qsort, "qsort", QSORT_S, QSORT_M, QSORT_L, "Quick sort") \
+  def(queen, "queen", QUEEN_S, QUEEN_M, QUEEN_L, "Queen placement") \
+  def(   bf,    "bf",    BF_S,    BF_M,    BF_L, "Brainf**k interpreter") \
+  def(  fib,   "fib",   FIB_S,   FIB_M,   FIB_L, "Fibonacci number") \
+  def(sieve, "sieve", SIEVE_S, SIEVE_M, SIEVE_L, "Eratosthenes sieve") \
+  def( 15pz,  "15pz",  PZ15_S,  PZ15_M,  PZ15_L, "A* 15-puzzle search") \
+  def(dinic, "dinic", DINIC_S, DINIC_M, DINIC_L, "Dinic's maxflow algorithm") \
+  def( lzip,  "lzip",  LZIP_S,  LZIP_M,  LZIP_L, "Lzip compression") \
+  def(ssort, "ssort", SSORT_S, SSORT_M, SSORT_L, "Suffix sort") \
+  def(  md5,   "md5",   MD5_S,   MD5_M,   MD5_L, "MD5 digest") \
+
+// Each benchmark will run REPEAT times
+
+// Declares the three entry points each benchmark provides:
+// bench_<name>_prepare(), bench_<name>_run() and bench_<name>_validate().
+// Only _name is used; the remaining BENCHMARK_LIST columns are ignored here.
+#define DECL(_name, _sname, _s, _m, _l, _desc) \
+  void bench_##_name##_prepare(); \
+  void bench_##_name##_run(); \
+  int bench_##_name##_validate();
+
+BENCHMARK_LIST(DECL)
+
+// One input configuration of a benchmark ({size, heap, time, checksum}).
+typedef struct Setting {
+  int size;                 // problem size; its meaning is benchmark-specific
+  unsigned long mlim, ref;  // mlim: heap budget in bytes; ref: reference time used for scoring
+  uint32_t checksum;        // expected result, compared by the benchmark's validate()
+} Setting;
+
+// Descriptor of one benchmark: entry points, display strings and the three
+// input settings.
+typedef struct Benchmark {
+  void (*prepare)();        // set-up hook
+  void (*run)();            // the workload itself
+  int (*validate)();        // returns non-zero when the result is correct
+  const char *name, *desc;  // short name and human-readable description
+  Setting settings[3];      // small, medium and large input, in that order
+} Benchmark;
+
+extern Benchmark *current;
+extern Setting *setting;
+
+// Outcome of a single benchmark run.
+typedef struct Result {
+  int pass;                 // non-zero if validation succeeded
+  unsigned long tsc, msec;  // msec: elapsed time in milliseconds; tsc: presumably a cycle count — TODO confirm where it is set
+} Result;
+
+void prepare(Result *res);
+void done(Result *res);
+
+// memory allocation
+void* bench_alloc(size_t size);
+void bench_free(void *ptr);
+
+// random number generator
+void bench_srand(uint32_t seed);
+uint32_t bench_rand(); // return a random number between 0..32767
+
+// checksum
+uint32_t checksum(void *start, void *end);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/microbench/src/15pz/15pz.cc
+++ b/microbench/src/15pz/15pz.cc
@ -0,0 +1,88 @@
+#include <benchmark.h>
+#include "puzzle.h"
+#include "heap.h"
+
+const int N = 4;
+
+static int PUZZLE_S[N*N] = {
+  1, 2, 3, 4,
+  5, 6, 7, 8,
+  9, 10, 0, 11,
+  13, 14, 15, 12,
+};
+
+static int PUZZLE_M[N*N] = {
+  1, 2, 3, 4,
+  5, 6, 7, 8,
+  12, 0, 14, 13,
+  11, 15, 10, 9,
+};
+
+static int PUZZLE_L[N*N] = {
+  0, 2, 3, 4,
+  9, 6, 7, 8,
+  5, 11, 10, 12,
+  1, 15, 13, 14,
+};
+
+static int ans;
+
+extern "C" {
+
+// Nothing to set up: bench_15pz_run() performs all allocation itself.
+void bench_15pz_prepare() {
+}
+
+// Search for the solution of the puzzle selected by setting->size (0, 1 or 2)
+// using Updatable_heap as the priority queue, popping at most MAXN states.
+// On success ans = (path length recorded for the solved state) * (number of
+// states popped); if no solution is reached ans stays -1.
+void bench_15pz_run() {
+  N_puzzle<N> puzzle;  // default constructor builds a random board; replaced below
+  int MAXN;
+
+  switch (setting->size) {
+    case 0: puzzle = N_puzzle<N>(PUZZLE_S); MAXN = 10; break;
+    case 1: puzzle = N_puzzle<N>(PUZZLE_M); MAXN = 2048; break;
+    case 2: puzzle = N_puzzle<N>(PUZZLE_L); MAXN = 16384; break;
+    default: assert(0);  // NOTE(review): MAXN stays uninitialized if assert() is compiled out
+  }
+  assert(puzzle.solvable());
+
+  // The heap object lives in benchmark memory; no constructor runs, so init()
+  // must be called before use.
+  auto *heap = (Updatable_heap<N_puzzle<N>> *) bench_alloc(sizeof(Updatable_heap<N_puzzle<N>>));
+  heap->init(MAXN);
+  heap->push( puzzle, 0 );
+
+  int n = 0;   // number of states popped so far
+  ans = -1;
+
+  while( heap->size() != 0 && n != MAXN ) {
+    N_puzzle<N> top = heap->pop();
+    ++n;
+
+    if ( top == N_puzzle<N>::solution() ) {
+      // We are done
+      ans = heap->length(top) * n;
+      return;
+    }
+
+    // Expand the neighbours, each one move further than top.
+    // NOTE(review): the order of these pushes probably influences tie-breaking
+    // in the heap and hence ans — do not reorder without re-deriving checksums.
+    if ( top.tile_left_possible() ) {
+      heap->push( top.tile_left(), heap->length( top ) + 1 );
+    }
+
+    if ( top.tile_right_possible() ) {
+      heap->push( top.tile_right(), heap->length( top ) + 1 );
+    }
+
+    if ( top.tile_up_possible() ) {
+      heap->push( top.tile_up(), heap->length( top ) + 1 );
+    }
+
+    if ( top.tile_down_possible() ) {
+      heap->push( top.tile_down(), heap->length( top ) + 1 );
+    }
+  }
+}
+
+
+// Returns non-zero when the value computed by bench_15pz_run() equals the
+// expected checksum of the current setting.
+int bench_15pz_validate() {
+  return (uint32_t)ans == setting->checksum;
+}
+
+}
+
--- a/microbench/src/15pz/heap.h
+++ b/microbench/src/15pz/heap.h
@ -0,0 +1,227 @@
+// Author:  Douglas Wilhelm Harder
+// Copyright (c) 2009 by Douglas Wilhelm Harder.  All rights reserved.
+
+// Larger of two values; when neither compares greater, b is returned.
+template <typename T>
+T max(T a, T b) {
+  if ( a > b ) {
+    return a;
+  }
+  return b;
+}
+
+// Min-heap of T ordered by Step::weight(), combined with a chained hash table
+// so an element already stored can be found and have its weight lowered.
+// Storage comes from bench_alloc(); call init() before any other member.
+template <typename T>
+class Updatable_heap {
+  private:
+    int M;                  // capacity passed to init(); also sizes the hash table
+    class Step;
+    Step **hash_table;      // bucket heads, indexed by hash() & (M - 1)
+    Step **heap;            // 1-indexed binary heap of Step pointers
+    int heap_size;          // number of elements currently in the heap
+    int maximum_heap_size;  // largest heap_size observed so far
+
+    void inline swap( int, int );
+    void percolate_down();
+    void percolate_up( int );
+    Step *pointer( T const & ) const;
+
+  public:
+    void init(int m);
+    ~Updatable_heap();
+    T pop();
+    void push( T const &, int );
+    int size() const;
+    int maximum_size() const;
+    int length( T const & ) const;
+};
+
+// Bookkeeping record for one element stored in Updatable_heap.
+template <typename T>
+class Updatable_heap<T>::Step {
+  public:
+    T element;            // the stored value
+    Step *next;           // next record in the same hash bucket
+    int heap_index;       // current position inside the heap array
+    int path_length;      // distance given when the element was first pushed
+    int path_weight;      // heap key: distance + element.lower_bound()
+    bool visited;         // cleared by init(); checked by push()
+    Step *previous_step;  // cleared by init(); not otherwise used in this file
+
+    void init( T const &, Step *, int, int );
+    int length() const;
+    int weight() const;
+};
+
+// Allocate and reset the heap array and hash table for a capacity of m.
+//
+// Must be called before any other member: the object is placed in memory
+// obtained from bench_alloc(), so no constructor runs.
+template <typename T>
+void Updatable_heap<T>::init(int m) {
+  M = m;
+  // The binary heap is 1-indexed (slot 0 is unused) and push() only asserts
+  // heap_size <= M *before* incrementing it, so indices up to M + 1 must be
+  // addressable.  Allocating just M slots would let heap[M] overrun the block
+  // and clobber whatever was allocated next (the hash table).
+  heap = (Step **)bench_alloc(sizeof(Step *) * (M + 2));
+  hash_table = (Step **)bench_alloc(sizeof(Step *) * (M + 1));
+
+  heap_size = 0;
+  maximum_heap_size = 0;
+  for ( int i = 0; i < M; ++i ) {
+    hash_table[i] = 0;
+  }
+}
+
+// Hand every Step in every hash bucket back to the benchmark allocator.
+// (The original loop walked the chains into an unused local and released
+// nothing.)
+template <typename T>
+Updatable_heap<T>::~Updatable_heap() {
+  for ( int i = 0; i < M; ++i ) {
+    Step *ptr = hash_table[i];
+
+    while ( ptr != 0 ) {
+      Step *tmp = ptr;
+      ptr = ptr->next;   // advance first: tmp must not be touched after release
+      bench_free( tmp );
+    }
+  }
+}
+
+// Remove and return the element with the smallest weight.
+// An empty heap yields a default-constructed T.
+template <typename T>
+T Updatable_heap<T>::pop() {
+  if ( heap_size == 0 ) {
+    return T();
+  }
+
+  T smallest = heap[1]->element;
+
+  if ( heap_size > 1 ) {
+    // Move the last leaf to the root, shrink, then restore heap order.
+    Step *last = heap[heap_size];
+    heap[1] = last;
+    last->heap_index = 1;
+
+    --heap_size;
+    percolate_down();
+  } else {
+    heap_size = 0;
+  }
+
+  return smallest;
+}
+
+// Exchange heap slots i and j and keep each record's heap_index in sync.
+template <typename T>
+void inline Updatable_heap<T>::swap( int i, int j ) {
+  Step *const at_i = heap[i];
+  Step *const at_j = heap[j];
+
+  heap[i] = at_j;
+  heap[j] = at_i;
+
+  at_j->heap_index = i;
+  at_i->heap_index = j;
+}
+
+// Sift the root (index 1) down until the heap order holds again.
+// Children of node n are at 2n and 2n + 1.
+template <typename T>
+void Updatable_heap<T>::percolate_down() {
+  int n = 1;
+
+  // Nodes that have two children.
+  while ( 2*n + 1 <= size() ) {
+    // Stop only when the node is strictly lighter than both children; on a
+    // tie with a child the node is still swapped down.
+    if ( heap[n]->weight() < heap[2*n]->weight() && heap[n]->weight() < heap[2*n + 1]->weight() ) {
+      return;
+    }
+
+    // Descend towards the lighter child (the right child on ties).
+    if ( heap[2*n]->weight() < heap[2*n + 1]->weight() ) {
+      swap( n, 2*n );
+      n = 2*n;
+    } else {
+      assert( heap[2*n]->weight() >= heap[2*n + 1]->weight() );
+
+      swap( n, 2*n + 1 );
+      n = 2*n + 1;
+    }
+  }
+
+  // A final node with only a left child.
+  if ( 2*n == size() &&  heap[2*n]->weight() < heap[n]->weight() ) {
+    swap( n, 2*n );
+  }
+}
+
+// Sift the node at index n towards the root while its parent is heavier.
+template <typename T>
+void Updatable_heap<T>::percolate_up( int n ) {
+  while ( n != 1 ) {
+    int const above = n/2;
+
+    if ( heap[above]->weight() <= heap[n]->weight() ) {
+      break;
+    }
+
+    swap( above, n );
+    n = above;
+  }
+}
+
+// Insert pz with the given path length, or, if pz is already stored and not
+// yet visited, lower its weight when the new path is better.
+// NOTE(review): the update path rewrites path_weight only; path_length keeps
+// the value from the first insertion — confirm this is intended.
+template <typename T>
+void Updatable_heap<T>::push( T const &pz, int path_length ) {
+  Step *known = pointer( pz );
+
+  if ( known != 0 ) {
+    if ( known->visited ) {
+      return;
+    }
+
+    int const candidate = path_length + known->element.lower_bound();
+    if ( candidate < known->weight() ) {
+      known->path_weight = candidate;
+      percolate_up( known->heap_index );
+    }
+    return;
+  }
+
+  // New element: append at the end of the heap and link it at the head of
+  // its hash bucket.
+  assert( heap_size <= M );
+  ++heap_size;
+
+  Step *node = (Step*)bench_alloc(sizeof(Step));
+  node->init( pz, hash_table[pz.hash() & (M - 1)], heap_size, path_length );
+  hash_table[pz.hash() & (M - 1)] = node;
+  heap[heap_size] = node;
+
+  percolate_up( heap_size );
+
+  maximum_heap_size = max( maximum_heap_size, heap_size );
+}
+
+// Number of elements currently in the heap.
+template <typename T>
+int Updatable_heap<T>::size() const {
+  return heap_size;
+}
+
+// Largest number of elements the heap has held since init().
+template <typename T>
+int Updatable_heap<T>::maximum_size() const {
+  return maximum_heap_size;
+}
+
+// Path length stored for pz, or 2147483647 when pz was never pushed.
+template <typename T>
+int Updatable_heap<T>::length( T const &pz ) const {
+  Step *found = pointer( pz );
+
+  if ( found == 0 ) {
+    return 2147483647;
+  }
+  return found->length();
+}
+
+// Look pz up in its hash bucket; returns its record, or 0 if it is absent.
+template <typename T>
+typename Updatable_heap<T>::Step *Updatable_heap<T>::pointer( T const &pz ) const {
+  Step *node = hash_table[pz.hash() & (M - 1)];
+
+  while ( node != 0 && !( node->element == pz ) ) {
+    node = node->next;
+  }
+
+  return node;
+}
+
+/****************************************************
+ * ************************************************ *
+ * *                   Iterator                   * *
+ * ************************************************ *
+ ****************************************************/
+
+// Fill in a freshly allocated record: pz is the element, n the current head
+// of its hash bucket, hi its heap slot and dist the path length so far.
+template <typename T>
+void Updatable_heap<T>::Step::init( T const &pz, Step *n, int hi, int dist ) {
+  element = pz;
+
+  // Key used by the heap: distance travelled plus the element's lower bound.
+  path_length = dist;
+  path_weight = dist + element.lower_bound();
+
+  heap_index = hi;
+  next = n;
+
+  previous_step = 0;
+  visited = false;
+}
+
+// Path length recorded when the element was first pushed.
+template <typename T>
+int Updatable_heap<T>::Step::length() const {
+  return path_length;
+}
+
+// Heap key of this record (path length plus the element's lower bound).
+template <typename T>
+int Updatable_heap<T>::Step::weight() const {
+  return path_weight;
+}
+
--- a/microbench/src/15pz/puzzle.h
+++ b/microbench/src/15pz/puzzle.h
@ -0,0 +1,475 @@
+// Author:  Douglas Wilhelm Harder
+// Copyright (c) 2009 by Douglas Wilhelm Harder.  All rights reserved.
+// Url: https://ece.uwaterloo.ca/~dwharder/aads/Algorithms/N_puzzles/
+
+// An N x N sliding-tile puzzle.  Tile value 0 is the blank; the solved board
+// has 1 .. N*N-1 in row-major order followed by the blank.
+template <int N>
+class N_puzzle {
+  private:
+    bool puzzle_valid;           // false for malformed boards and impossible moves
+    uint8_t zero_i, zero_j;      // row and column of the blank
+    int8_t manhattan_distance;   // sum over non-blank tiles of |row - home row| + |col - home col|
+    int8_t puzzle[N][N];         // tile values, puzzle[row][column]
+    int hash_value;              // cached by determine_hash()
+
+    void determine_hash();
+
+    static int abs( int n ) { return ( n < 0 ) ? -n : n; }
+
+  public:
+    N_puzzle();                  // random board drawn with bench_rand()
+    N_puzzle( int array[N*N] );  // board from a row-major array
+    N_puzzle( N_puzzle const & );
+    N_puzzle &operator=( N_puzzle const & );
+
+    bool solvable() const;
+    bool valid() const;
+    int lower_bound() const;
+    unsigned int hash() const;
+
+    // tile_X_possible(): can a tile slide in direction X into the blank?
+    bool tile_up_possible() const;
+    bool tile_down_possible() const;
+    bool tile_left_possible() const;
+    bool tile_right_possible() const;
+
+    // tile_X(): the board after that move (this object is not modified).
+    N_puzzle tile_up() const;
+    N_puzzle tile_down() const;
+    N_puzzle tile_left() const;
+    N_puzzle tile_right() const;
+
+    bool operator==( N_puzzle const & ) const;
+    bool operator!=( N_puzzle const & ) const;
+
+    N_puzzle static solution();
+};
+
+/*
+ * Build a random board: every value 0 .. N*N-1 is placed exactly once, in an
+ * order drawn from bench_rand().  The Manhattan distance and the hash are
+ * computed along the way.
+ */
+template < int N >
+N_puzzle<N>::N_puzzle():
+puzzle_valid( true ),
+manhattan_distance( 0 ) {
+  // Pool of values not yet placed; the first N*N - n entries are live.
+  int array[N*N];
+
+  for ( int i = 0; i < N*N; ++i ) {
+    array[i] = i;
+  }
+
+  int n = 0;   // number of values placed so far
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      // Pick one of the remaining values at random.
+      int k = bench_rand() % (N*N - n);
+      puzzle[i][j] = array[k];
+
+      if ( array[k] == 0 ) {
+        zero_i = i;
+        zero_j = j;
+      } else {
+        // Home position of tile v is row (v-1)/N, column (v-1)%N.
+        manhattan_distance += abs( ((array[k] - 1) / N) - i );
+        manhattan_distance += abs( ((array[k] - 1) % N) - j );
+      }
+
+      // Remove the chosen value by moving the last live entry into its slot.
+      ++n;
+      array[k] = array[N*N - n];
+    }
+  }
+
+  determine_hash();
+}
+
+/*
+ * Build a board from a row-major array of N*N tile values (0 is the blank).
+ *
+ * The board is marked invalid when the array is not a permutation of
+ * 0 .. N*N-1, i.e. when a value is out of range, duplicated or missing.
+ * Out-of-range values are rejected before they are used as an index, and
+ * every member is given a defined value even for invalid boards.
+ */
+template < int N >
+N_puzzle<N>::N_puzzle( int array[N*N] ):
+puzzle_valid( true ),
+zero_i( 0 ),
+zero_j( 0 ),
+manhattan_distance( 0 ),
+hash_value( 0 ) {
+  bool seen[N*N];
+
+  for ( int i = 0; i < N*N; ++i ) {
+    seen[i] = false;
+  }
+
+  bool in_range = true;
+  int n = 0;
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j, ++n ) {
+      int const tile = array[n];
+
+      if ( tile < 0 || tile >= N*N ) {
+        // Cannot be stored or used to index seen[]; keep the cell defined.
+        in_range = false;
+        puzzle[i][j] = 0;
+        continue;
+      }
+
+      puzzle[i][j] = tile;
+      seen[tile] = true;
+
+      if ( tile == 0 ) {
+        zero_i = i;
+        zero_j = j;
+      } else {
+        // Home position of tile v is row (v-1)/N, column (v-1)%N.
+        manhattan_distance += abs( ((tile - 1) / N) - i );
+        manhattan_distance += abs( ((tile - 1) % N) - j );
+      }
+    }
+  }
+
+  if ( !in_range ) {
+    puzzle_valid = false;
+    return;
+  }
+
+  // A value that never appeared means another one was duplicated.
+  for ( int i = 0; i < N*N; ++i ) {
+    if ( !seen[i] ) {
+      puzzle_valid = false;
+      return;
+    }
+  }
+
+  determine_hash();
+}
+
+/*
+ * Determine a hash value for the puzzle: a polynomial in 1973 over the tile
+ * values in row-major order, reduced modulo 2^32.
+ *
+ * The accumulation is done in unsigned arithmetic because the polynomial
+ * overflows 32 bits after a few tiles and signed overflow is undefined
+ * behaviour.  The stored bit pattern is the one two's-complement wrap-around
+ * would give.
+ */
+
+template < int N >
+void N_puzzle<N>::determine_hash() {
+  unsigned int h = 0;
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      h = h*1973u + (unsigned int)puzzle[i][j];
+    }
+  }
+
+  hash_value = (int)h;
+}
+
+// Copy constructor: member-wise copy of the flags, blank position, cached
+// distance and hash, plus every tile.
+template < int N >
+N_puzzle<N>::N_puzzle( N_puzzle const &pz ):
+puzzle_valid( pz.puzzle_valid ),
+zero_i( pz.zero_i ),
+zero_j( pz.zero_j ),
+manhattan_distance( pz.manhattan_distance ),
+hash_value( pz.hash_value ) {
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      puzzle[i][j] = pz.puzzle[i][j];
+    }
+  }
+}
+
+// Copy assignment: copies every member of rhs and returns *this.
+template < int N >
+N_puzzle<N> &N_puzzle<N>::operator=( N_puzzle const &rhs ) {
+  puzzle_valid = rhs.puzzle_valid;
+  zero_i = rhs.zero_i;
+  zero_j = rhs.zero_j;
+  manhattan_distance = rhs.manhattan_distance;
+  hash_value = rhs.hash_value;
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      puzzle[i][j] = rhs.puzzle[i][j];
+    }
+  }
+  return *this;
+}
+
+
+/*
+ *  Moving a tile up is possible as long as
+ *  the blank is not in the last row
+ *  (the tile below the blank slides up into it).
+ *  Always false for an invalid puzzle.
+ */
+
+template <int N>
+bool N_puzzle<N>::tile_up_possible() const {
+  return puzzle_valid && (zero_i != N - 1);
+}
+
+/*
+ *  Moving a tile down is possible as long as
+ *  the blank is not in the first row
+ *  (the tile above the blank slides down into it).
+ *  Always false for an invalid puzzle.
+ */
+
+template <int N>
+bool N_puzzle<N>::tile_down_possible() const {
+  return puzzle_valid && (zero_i != 0);
+}
+
+/*
+ *  Moving a tile left is possible as long as
+ *  the blank is not in the last column
+ *  (the tile right of the blank slides left into it).
+ *  Always false for an invalid puzzle.
+ */
+
+template <int N>
+bool N_puzzle<N>::tile_left_possible() const {
+  return puzzle_valid && (zero_j != N - 1);
+}
+
+/*
+ *  Moving a tile right is possible as long as
+ *  the blank is not in the first column
+ *  (the tile left of the blank slides right into it).
+ *  Always false for an invalid puzzle.
+ */
+
+template <int N>
+bool N_puzzle<N>::tile_right_possible() const {
+  return puzzle_valid && (zero_j != 0);
+}
+
+/*
+ *  Board after sliding the tile below the blank upwards (the blank moves
+ *  down one row).  An invalid puzzle is returned unchanged; if the blank is
+ *  already in the last row the result is marked invalid.
+ */
+template <int N>
+N_puzzle<N> N_puzzle<N>::tile_up() const {
+  if ( !puzzle_valid ) {
+    return *this;
+  }
+
+  N_puzzle moved( *this );
+
+  if ( zero_i == N - 1 ) {
+    moved.puzzle_valid = false;
+    return moved;
+  }
+
+  // Only the sliding tile's row distance to its home row changes.
+  int8_t const tile = puzzle[zero_i + 1][zero_j];
+  int const home_row = (tile - 1) / N;
+
+  moved.manhattan_distance +=
+    abs( home_row - zero_i ) - abs( home_row - (zero_i + 1) );
+
+  moved.puzzle[zero_i][zero_j] = tile;
+  ++moved.zero_i;
+  moved.puzzle[moved.zero_i][zero_j] = 0;
+
+  moved.determine_hash();
+
+  return moved;
+}
+
+/*
+ *  Board after sliding the tile above the blank downwards (the blank moves
+ *  up one row).  An invalid puzzle is returned unchanged; if the blank is
+ *  already in the first row the result is marked invalid.
+ */
+template <int N>
+N_puzzle<N> N_puzzle<N>::tile_down() const {
+  if ( !puzzle_valid ) {
+    return *this;
+  }
+
+  N_puzzle moved( *this );
+
+  if ( zero_i == 0 ) {
+    moved.puzzle_valid = false;
+    return moved;
+  }
+
+  // Only the sliding tile's row distance to its home row changes.
+  int8_t const tile = puzzle[zero_i - 1][zero_j];
+  int const home_row = (tile - 1) / N;
+
+  moved.manhattan_distance +=
+    abs( home_row - zero_i ) - abs( home_row - (zero_i - 1) );
+
+  moved.puzzle[zero_i][zero_j] = tile;
+  --moved.zero_i;
+  moved.puzzle[moved.zero_i][zero_j] = 0;
+
+  moved.determine_hash();
+
+  return moved;
+}
+
+/*
+ *  Board after sliding the tile right of the blank to the left (the blank
+ *  moves one column right).  An invalid puzzle is returned unchanged; if the
+ *  blank is already in the last column the result is marked invalid.
+ */
+template <int N>
+N_puzzle<N> N_puzzle<N>::tile_left() const {
+  if ( !puzzle_valid ) {
+    return *this;
+  }
+
+  N_puzzle moved( *this );
+
+  if ( zero_j == N - 1 ) {
+    moved.puzzle_valid = false;
+    return moved;
+  }
+
+  // Only the sliding tile's column distance to its home column changes.
+  int8_t const tile = puzzle[zero_i][zero_j + 1];
+  int const home_col = (tile - 1) % N;
+
+  moved.manhattan_distance +=
+    abs( home_col - zero_j ) - abs( home_col - (zero_j + 1) );
+
+  moved.puzzle[zero_i][zero_j] = tile;
+  ++moved.zero_j;
+  moved.puzzle[zero_i][moved.zero_j] = 0;
+
+  moved.determine_hash();
+
+  return moved;
+}
+
+/*
+ *  Board after sliding the tile left of the blank to the right (the blank
+ *  moves one column left).  An invalid puzzle is returned unchanged; if the
+ *  blank is already in the first column the result is marked invalid.
+ */
+template <int N>
+N_puzzle<N> N_puzzle<N>::tile_right() const {
+  if ( !puzzle_valid ) {
+    return *this;
+  }
+
+  N_puzzle moved( *this );
+
+  if ( zero_j == 0 ) {
+    moved.puzzle_valid = false;
+    return moved;
+  }
+
+  // Only the sliding tile's column distance to its home column changes.
+  int8_t const tile = puzzle[zero_i][zero_j - 1];
+  int const home_col = (tile - 1) % N;
+
+  moved.manhattan_distance +=
+    abs( home_col - zero_j ) - abs( home_col - (zero_j - 1) );
+
+  moved.puzzle[zero_i][zero_j] = tile;
+  --moved.zero_j;
+  moved.puzzle[zero_i][moved.zero_j] = 0;
+
+  moved.determine_hash();
+
+  return moved;
+}
+
+/*
+ *  Check if the puzzle is solvable:  that is, check the parity of the
+ *  number of inversions plus the Manhattan distance of the blank from
+ *  the lower-right corner.  The blank counts as the largest value (N*N)
+ *  when inversions are counted.
+ *
+ *  Run time:   O(n^2)
+ *  Memory:     O(n)
+ */
+
+template <int N>
+bool N_puzzle<N>::solvable() const {
+  if ( !valid() ) {
+    return false;
+  }
+
+  // Row-major copy of the board with the blank replaced by N*N.
+  int entries[N*N];
+
+  for ( int k = 0; k < N*N; ++k ) {
+    int const tile = puzzle[k / N][k % N];
+    entries[k] = ( tile == 0 ) ? N*N : tile;
+  }
+
+  // Start from the blank's distance to the corner, then add the inversions.
+  int parity = 2*N - 2 - zero_i - zero_j;
+
+  for ( int a = 0; a < N*N; ++a ) {
+    for ( int b = a + 1; b < N*N; ++b ) {
+      if ( entries[a] > entries[b] ) {
+        ++parity;
+      }
+    }
+  }
+
+  return ( (parity & 1) == 0 );
+}
+
+// True unless the board was built from a malformed array or produced by an
+// impossible move.
+template <int N>
+bool N_puzzle<N>::valid() const {
+  return puzzle_valid;
+}
+
+/*
+ *  Return the heuristic used to order the search: the Manhattan distance
+ *  between the puzzle and the solution, which is maintained incrementally
+ *  in manhattan_distance.  An invalid puzzle reports N*N*N.
+ */
+
+template <int N>
+int N_puzzle<N>::lower_bound() const {
+  return valid() ? manhattan_distance : N*N*N;
+}
+
+/*
+ *  puzzle1 == puzzle2
+ *
+ *  Two puzzles are equal when both are valid and all their entries match:
+ *    If either puzzle is not valid, return false.
+ *    If the hash values differ, the puzzles differ; return false.
+ *    Otherwise, compare every entry.
+ */
+
+template < int N >
+bool N_puzzle<N>::operator==( N_puzzle const &rhs ) const {
+  if ( !valid() || !rhs.valid() ) {
+    return false;
+  }
+
+  // Cheap rejection before the entry-by-entry comparison.
+  if ( hash() != rhs.hash() ) {
+    return false;
+  }
+
+  for ( int k = 0; k < N*N; ++k ) {
+    if ( puzzle[k / N][k % N] != rhs.puzzle[k / N][k % N] ) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/*
+ *  puzzle1 != puzzle2
+ *
+ *  Two puzzles are unequal when both are valid and any entry differs:
+ *    If either puzzle is not valid, return false.
+ *    If the hash values differ, the puzzles differ; return true.
+ *    Otherwise, compare every entry.
+ *
+ *  Note that for an invalid puzzle both == and != return false.
+ */
+
+template < int N >
+bool N_puzzle<N>::operator!=( N_puzzle const &rhs ) const {
+  if ( !valid() || !rhs.valid() ) {
+    return false;
+  }
+
+  bool differs = ( hash() != rhs.hash() );
+
+  for ( int k = 0; !differs && k < N*N; ++k ) {
+    differs = ( puzzle[k / N][k % N] != rhs.puzzle[k / N][k % N] );
+  }
+
+  return differs;
+}
+
+/*
+ * unsigned int hash() const
+ *
+ *   Returns the pre-calculated hash value, or 0 for an invalid puzzle.
+ */
+
+template < int N >
+unsigned int N_puzzle<N>::hash() const {
+  return valid() ? hash_value : 0;
+}
+
+/*
+ * N_puzzle<N>  solution()
+ *
+ *   Returns the solved N puzzle: the tiles in increasing order with the
+ *   blank in the lower-right corner.
+ *
+ *       1  2  3         1   2   3   4
+ *  3x3: 4  5  6   4x4:  5   6   7   8
+ *       7  8            9  10  11  12
+ *                      13  14  15
+ */
+
+template <int N>
+N_puzzle<N> N_puzzle<N>::solution() {
+  int array[N*N];
+
+  for ( int tile = 1; tile < N*N; ++tile ) {
+    array[tile - 1] = tile;
+  }
+
+  // The last cell holds the blank.
+  array[N*N - 1] = 0;
+
+  return N_puzzle<N>( array );
+}
+
--- a/microbench/src/bench.c
+++ b/microbench/src/bench.c
@ -0,0 +1,181 @@
+#include <am.h>
+#include <benchmark.h>
+#include <limits.h>
+#include <klib-macros.h>
+
+Benchmark *current;
+Setting *setting;
+
+static char *hbrk;
+
+// Current uptime in milliseconds, derived from the AM timer's `us` field
+// (presumably microseconds) and truncated to 32 bits.
+static uint32_t uptime_ms() { return io_read(AM_TIMER_UPTIME).us / 1000; }
+
+// The benchmark list
+//
+// ENTRY turns one BENCHMARK_LIST row into a Benchmark initializer: the three
+// bench_<name>_* functions, the display strings and the S/M/L settings.
+
+#define ENTRY(_name, _sname, _s, _m, _l, _desc) \
+  { .prepare = bench_##_name##_prepare, \
+    .run = bench_##_name##_run, \
+    .validate = bench_##_name##_validate, \
+    .name = _sname, \
+    .desc = _desc, \
+    .settings = {_s, _m, _l}, },
+
+// One entry per benchmark, in BENCHMARK_LIST order.
+Benchmark benchmarks[] = {
+  BENCHMARK_LIST(ENTRY)
+};
+
+// Running a benchmark
+
+// Start timing: store the current uptime in res->msec.
+static void bench_prepare(Result *res) {
+  res->msec = uptime_ms();
+}
+
+// Discard every bench_alloc() allocation by rewinding the break pointer to
+// the start of the heap, rounded up to 8 bytes.
+static void bench_reset() {
+  hbrk = (void *)ROUNDUP(heap.start, 8);
+}
+
+// Stop timing: replace the start time in res->msec with the elapsed
+// milliseconds since bench_prepare().
+static void bench_done(Result *res) {
+  res->msec = uptime_ms() - res->msec;
+}
+
+// Decide whether the current setting can run on this platform.
+// Returns NULL if it can, otherwise a short reason string.
+// Only the global `setting` is consulted; the bench argument is unused.
+static const char *bench_check(Benchmark *bench) {
+  uintptr_t freesp = (uintptr_t)heap.end - (uintptr_t)heap.start;
+  if (freesp < setting->mlim) {
+    return "(insufficient memory)";
+  }
+  return NULL;
+}
+
+// Execute one timed run of the benchmark selected by the global `current`
+// and store elapsed time and validation result in *res.
+// The argument b is unused; the global is consulted instead.
+static void run_once(Benchmark *b, Result *res) {
+  bench_reset();       // reset malloc state
+  current->prepare();  // call benchmark's prepare function (not timed)
+  bench_prepare(res);  // start timer
+  current->run();      // run it
+  bench_done(res);     // stop timer, store elapsed milliseconds
+  res->pass = current->validate();
+}
+
+// Score of the current setting for a run that took msec milliseconds:
+// (REF_SCORE / 1000) * setting->ref / msec, or 0 when msec is 0.
+// The arguments b and tsc are unused.
+static unsigned long score(Benchmark *b, unsigned long tsc, unsigned long msec) {
+  if (msec == 0) return 0;
+  return (REF_SCORE / 1000) * setting->ref / msec;
+}
+
+// Entry point.  args selects the input size: "test", "train" or "ref"
+// (an empty or NULL string means "ref").  Runs every benchmark in
+// benchmarks[] REPEAT times, prints per-benchmark and overall results,
+// and returns 0.
+int main(const char *args) {
+  const char *setting_name = args;
+  if (args == NULL || strcmp(args, "") == 0) {
+    printf("Empty mainargs. Use \"ref\" by default\n");
+    setting_name = "ref";
+  }
+  int setting_id = -1;
+
+  // Map the name to an index into Benchmark::settings.
+  if      (strcmp(setting_name, "test" ) == 0) setting_id = 0;
+  else if (strcmp(setting_name, "train") == 0) setting_id = 1;
+  else if (strcmp(setting_name, "ref"  ) == 0) setting_id = 2;
+  else {
+    printf("Invalid mainargs: \"%s\"; "
+           "must be in {test, train, ref}\n", setting_name);
+    halt(1);
+  }
+
+  ioe_init();
+
+  printf("======= Running MicroBench [input *%s*] =======\n", setting_name);
+
+  unsigned long bench_score = 0;  // sum of per-benchmark scores
+  int pass = 1;                   // cleared as soon as any run fails validation
+  uint32_t t0 = uptime_ms();
+
+  for (int i = 0; i < LENGTH(benchmarks); i ++) {
+    Benchmark *bench = &benchmarks[i];
+    // Publish the selection through the globals the benchmarks read.
+    current = bench;
+    setting = &bench->settings[setting_id];
+    const char *msg = bench_check(bench);
+    printf("[%s] %s: ", bench->name, bench->desc);
+    if (msg != NULL) {
+      printf("Ignored %s\n", msg);
+    } else {
+      unsigned long msec = ULONG_MAX;  // best (smallest) time over all repeats
+      int succ = 1;
+      // Note: this inner i shadows the benchmark index of the outer loop.
+      for (int i = 0; i < REPEAT; i ++) {
+        Result res;
+        run_once(bench, &res);
+        printf(res.pass ? "*" : "X");
+        succ &= res.pass;
+        if (res.msec < msec) msec = res.msec;
+      }
+
+      if (succ) printf(" Passed.");
+      else printf(" Failed.");
+
+      pass &= succ;
+
+      unsigned long cur = score(bench, 0, msec);
+
+      printf("\n");
+      // Timing details are suppressed for the "test" input.
+      if (setting_id != 0) {
+        printf("  min time: %d ms [%d]\n", (unsigned int)msec, (unsigned int)cur);
+      }
+
+      bench_score += cur;
+    }
+  }
+  uint32_t t1 = uptime_ms();
+
+  // Average over all benchmarks; ignored ones contribute a score of 0.
+  bench_score /= LENGTH(benchmarks);
+
+  printf("==================================================\n");
+  printf("MicroBench %s", pass ? "PASS" : "FAIL");
+  // The overall mark is only reported for the "ref" input.
+  if (setting_id == 2) {
+    printf("        %d Marks\n", (unsigned int)bench_score);
+    printf("                   vs. %d Marks (%s)\n", REF_SCORE, REF_CPU);
+  } else {
+    printf("\n");
+  }
+  printf("Total time: %d ms\n", t1 - t0);
+  return 0;
+}
+
+// Libraries
+
+// Bump allocator for benchmarks: returns `size` bytes (rounded up to a
+// multiple of 8) of zeroed memory taken from the AM heap.  Asserts that the
+// heap bounds and the current setting's memory limit are respected.
+// Memory is reclaimed only by bench_reset().
+void* bench_alloc(size_t size) {
+  char *const block = hbrk;
+
+  hbrk = block + (size_t)ROUNDUP(size, 8);
+  assert((uintptr_t)heap.start <= (uintptr_t)hbrk && (uintptr_t)hbrk < (uintptr_t)heap.end);
+
+  // Clear the block one 64-bit word at a time.
+  uint64_t *word = (uint64_t *)block;
+  while (word != (uint64_t *)hbrk) {
+    *word++ = 0;
+  }
+
+  assert((uintptr_t)hbrk - (uintptr_t)heap.start <= setting->mlim);
+  return block;
+}
+
+// Intentionally a no-op: bench_alloc() memory is reclaimed in bulk by
+// bench_reset().
+void bench_free(void *ptr) {
+}
+
+// State of the pseudo-random generator used by the benchmarks.
+static uint32_t seed = 1;
+
+// Re-seed the generator; only the low 15 bits of _seed are kept.
+void bench_srand(uint32_t _seed) {
+  seed = _seed & 0x7fff;
+}
+
+// Advance the linear congruential generator (seed * 214013 + 2531011,
+// modulo 2^32) and return bits 16..30 of the new state, i.e. a value in
+// 0..32767.
+uint32_t bench_rand() {
+  seed = (seed * (uint32_t)214013L + (uint32_t)2531011L);
+  return (seed >> 16) & 0x7fff;
+}
+
+// Arithmetic (sign-propagating) right shift of a 32-bit pattern by k bits,
+// 0 < k < 32, written with unsigned operations only so the result does not
+// depend on how the compiler shifts negative signed values.
+static inline uint32_t checksum_sar32(uint32_t v, unsigned k) {
+  uint32_t sign_fill = (v & 0x80000000u) ? ~(0xffffffffu >> k) : 0u;
+  return (v >> k) | sign_fill;
+}
+
+// FNV hash
+//
+// Hashes the bytes in [start, end) in 4-byte groups (xor a byte, multiply by
+// 16777619, starting from 2166136261) and then applies a fixed shift/add/xor
+// finalizer.
+//
+// The group loop runs only while MORE than 4 bytes remain, so the final 1..4
+// bytes of the range are never hashed (a range of 0..4 bytes hashes nothing).
+// The expected checksums in benchmark.h were produced with this behaviour,
+// so it is kept as is.
+//
+// The finalizer is evaluated on uint32_t: left-shifting a signed value that
+// overflows or is negative is undefined behaviour in C.  The results are
+// bit-identical to two's-complement arithmetic with an arithmetic right
+// shift.
+uint32_t checksum(void *start, void *end) {
+  const uint32_t x = 16777619;
+  uint32_t h1 = 2166136261u;
+  const uint8_t *p = (const uint8_t *)start;
+  const uint8_t *const stop = (const uint8_t *)end;
+
+  // "stop - p > 4" is the original "p + 4 < end" without forming a pointer
+  // past the end of the buffer.
+  for (; stop - p > 4; p += 4) {
+    for (int i = 0; i < 4; i ++) {
+      h1 = (h1 ^ p[i]) * x;
+    }
+  }
+
+  uint32_t hash = h1;
+  hash += hash << 13;
+  hash ^= checksum_sar32(hash, 7);
+  hash += hash << 3;
+  hash ^= checksum_sar32(hash, 17);
+  hash += hash << 5;
+  return hash;
+}
--- a/microbench/src/bf/bf.c
+++ b/microbench/src/bf/bf.c
@ -0,0 +1,151 @@
+/*
+ Brainfuck-C ( http://github.com/kgabis/brainfuck-c )
+ Copyright (c) 2012 Krzysztof Gabis
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <benchmark.h>
+
+static int ARR_SIZE;
+
+#define CODE            ">>+>>>>>,[>+>>,]>+[--[+<<<-]<[<+>-]<[<[->[<<<+>>>>+<-]<<[>>+>[->]<<[<]" \
+                        "<-]>]>>>+<[[-]<[>+<-]<]>[[>>>]+<<<-<[<<[<<<]>>+>[>>>]<-]<<[<<<]>[>>[>>" \
+                        ">]<+<<[<<<]>-]]+<<<]+[->>>]>>]>>[.>>>]"
+
+#define OP_END          0
+#define OP_INC_DP       1
+#define OP_DEC_DP       2
+#define OP_INC_VAL      3
+#define OP_DEC_VAL      4
+#define OP_OUT          5
+#define OP_IN           6
+#define OP_JMP_FWD      7
+#define OP_JMP_BCK      8
+
+#define SUCCESS         0
+#define FAILURE         1
+
+#define PROGRAM_SIZE    4096
+#define STACK_SIZE      512
+#define DATA_SIZE       4096
+
+#define STACK_PUSH(A)   (STACK[SP++] = A)
+#define STACK_POP()     (STACK[--SP])
+#define STACK_EMPTY()   (SP == 0)
+#define STACK_FULL()    (SP == STACK_SIZE)
+
+// One compiled instruction of the interpreted program.
+struct instruction_t {
+  unsigned short operator;  // one of the OP_* codes
+  unsigned short operand;   // for the two jump opcodes: index of the matching bracket
+};
+
+static struct instruction_t *PROGRAM;
+static unsigned short *STACK;
+static unsigned int SP;
+static const char *code;
+static char *input;
+
+// Translate the source text at `code` into PROGRAM, resolving matching
+// brackets through STACK.  Characters that are not commands are skipped.
+// Returns SUCCESS, or FAILURE on unbalanced brackets, bracket nesting
+// deeper than STACK_SIZE, or a program that does not fit in PROGRAM_SIZE
+// (including the terminating OP_END).  `code` is advanced as it is read.
+static int compile_bf() {
+  unsigned short pc = 0;
+
+  for (; *code; code ++) {
+    if (pc >= PROGRAM_SIZE) break;
+
+    struct instruction_t *insn = &PROGRAM[pc];
+
+    switch (*code) {
+      case '>': insn->operator = OP_INC_DP; break;
+      case '<': insn->operator = OP_DEC_DP; break;
+      case '+': insn->operator = OP_INC_VAL; break;
+      case '-': insn->operator = OP_DEC_VAL; break;
+      case '.': insn->operator = OP_OUT; break;
+      case ',': insn->operator = OP_IN; break;
+      case '[':
+        insn->operator = OP_JMP_FWD;
+        if (STACK_FULL()) {
+          return FAILURE;
+        }
+        // Remember where the loop opens; the operand is patched at ']'.
+        STACK_PUSH(pc);
+        break;
+      case ']': {
+        if (STACK_EMPTY()) {
+          return FAILURE;
+        }
+        unsigned short open_pc = STACK_POP();
+        insn->operator = OP_JMP_BCK;
+        insn->operand = open_pc;
+        PROGRAM[open_pc].operand = pc;
+        break;
+      }
+      default:
+        continue;  // not a command: emit nothing, keep pc unchanged
+    }
+
+    pc++;
+  }
+
+  if (!STACK_EMPTY() || pc == PROGRAM_SIZE) {
+    return FAILURE;
+  }
+
+  PROGRAM[pc].operator = OP_END;
+  return SUCCESS;
+}
+
+static unsigned short *data;
+static char *output;
+static int noutput;
+
+// Interpret PROGRAM until OP_END, an unknown opcode, or the data pointer
+// leaving [0, DATA_SIZE).  Input is read from `input`, output bytes are
+// appended to `output` and counted in `noutput`.
+static void execute_bf() {
+  unsigned int pc = 0, ptr = 0;
+  // ptr is unsigned, so stepping below cell 0 wraps around and also fails
+  // the ptr < DATA_SIZE test.
+  while (PROGRAM[pc].operator != OP_END && ptr < DATA_SIZE) {
+    switch (PROGRAM[pc].operator) {
+      case OP_INC_DP: ptr++; break;
+      case OP_DEC_DP: ptr--; break;
+      case OP_INC_VAL: data[ptr]++; break;
+      case OP_DEC_VAL: data[ptr]--; break;
+      // NOTE(review): noutput is not bounds-checked against the output buffer.
+      case OP_OUT: output[noutput ++] = data[ptr]; break;
+      case OP_IN: data[ptr] = *(input ++); break;
+      // Jumps land on the matching bracket; the pc++ below steps past it.
+      case OP_JMP_FWD: if(!data[ptr]) { pc = PROGRAM[pc].operand; } break;
+      case OP_JMP_BCK: if(data[ptr]) { pc = PROGRAM[pc].operand; } break;
+      default: return;
+    }
+    pc++;
+  }
+}
+
+// Allocate the interpreter's buffers and generate the input: ARR_SIZE
+// pseudo-random alphanumeric characters followed by a zero byte (the
+// buffer comes zeroed from bench_alloc()).
+void bench_bf_prepare() {
+  static const char alphabet[] =
+    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+  ARR_SIZE = setting->size;
+  SP = 0;
+  noutput = 0;
+  code = CODE;
+
+  PROGRAM = bench_alloc(sizeof(PROGRAM[0]) * PROGRAM_SIZE);
+  STACK = bench_alloc(sizeof(STACK[0]) * STACK_SIZE);
+  data = bench_alloc(sizeof(data[0]) * DATA_SIZE);
+  input = bench_alloc(ARR_SIZE + 1);
+  output = bench_alloc(DATA_SIZE);
+
+  bench_srand(1);
+  for (int k = 0; k < ARR_SIZE; k ++) {
+    input[k] = alphabet[bench_rand() % 62];
+  }
+}
+
+// Compile the program and, if that succeeded, interpret it.
+// When compilation fails nothing is executed, so noutput stays 0 and
+// bench_bf_validate() reports the failure instead of running a
+// half-compiled program.
+void bench_bf_run() {
+  if (compile_bf() != SUCCESS) {
+    return;
+  }
+  execute_bf();
+}
+
+// The run is correct when exactly ARR_SIZE bytes were output and their
+// checksum matches the expected value of the current setting.
+int bench_bf_validate() {
+  uint32_t cs = checksum(output, output + noutput);
+  return noutput == ARR_SIZE && cs == setting->checksum;
+}
--- a/microbench/src/dinic/dinic.cc
+++ b/microbench/src/dinic/dinic.cc
@ -0,0 +1,138 @@
+#include <benchmark.h>
+
+static int N;
+const int INF = 0x3f3f3f;
+
+// One directed arc of the flow network: `flow` units out of `cap` are in use.
+struct Edge {
+  int from, to, cap, flow;
+  // Leaves all four fields uninitialized.
+  Edge() {}
+  Edge(int from, int to, int cap, int flow)
+      : from(from), to(to), cap(cap), flow(flow) {}
+};
+
+// Returns the smaller argument; y is returned when neither is smaller.
+template<typename T>
+static inline T min(T x, T y) {
+  if (x < y) {
+    return x;
+  }
+  return y;
+}
+
+// Dinic-style max-flow solver. The graph is stored as per-vertex linked lists:
+// head[v] is the index of the most recently added arc leaving v and nxt[i]
+// chains to the next one (-1 terminates). Arcs are appended in pairs, forward
+// arc first and its zero-capacity reverse arc right after, starting at index
+// 0, so edges[i ^ 1] is always the partner of edges[i].
+struct Dinic {
+  int n, m, s, t;  // vertex count, arc count, source, sink
+  Edge *edges;
+  int *head, *nxt, *d, *cur, *queue;  // d: BFS level; cur: per-phase copy of head
+  bool *vis;
+
+  // Allocates storage for n vertices and resets the graph to empty.
+  // The arc capacity maxm is derived from n as if n == 2 * nold + 2 and at
+  // most nold * nold + 2 * nold AddEdge calls follow (two arcs per call).
+  // NOTE(review): presumably sized for bench_dinic_prepare's layout; no
+  // bound is enforced in AddEdge.
+  void init(int n) {
+    int nold = (n - 2) / 2;
+    int maxm = (nold * nold + nold * 2) * 2;
+
+    edges = (Edge *)bench_alloc(sizeof(Edge) * maxm);
+    head = (int *)bench_alloc(sizeof(int) * n);
+    nxt = (int *)bench_alloc(sizeof(int) * maxm);
+    vis = (bool *)bench_alloc(sizeof(bool) * n);
+    d = (int *)bench_alloc(sizeof(int) * n);
+    cur = (int *)bench_alloc(sizeof(int) * n);
+    queue = (int *)bench_alloc(sizeof(int) * n);
+
+    this->n = n;
+    for (int i = 0; i < n; i ++) {
+      head[i] = -1;
+    }
+    m = 0;
+  }
+
+  // Adds arc u -> v with capacity c plus its reverse arc v -> u with
+  // capacity 0. A capacity of 0 adds nothing.
+  void AddEdge(int u, int v, int c) {
+    if (c == 0) return;
+    edges[m] = Edge(u, v, c, 0);
+    nxt[m] = head[u];
+    head[u] = m++;
+    edges[m] = Edge(v, u, 0, 0);
+    nxt[m] = head[v];
+    head[v] = m++;
+  }
+
+  // Breadth-first search from s over arcs with remaining capacity; records
+  // each reached vertex's distance in d[]. Returns whether t was reached.
+  bool BFS() {
+    for (int i = 0; i < n; i ++) vis[i] = 0;
+    int qf = 0, qr = 0;  // queue front / rear indices
+    queue[qr ++] = s;
+    d[s] = 0;
+    vis[s] = 1;
+    while (qf != qr) {
+      int x = queue[qf ++];
+      for (int i = head[x]; i != -1; i = nxt[i]) {
+        Edge& e = edges[i];
+        if (!vis[e.to] && e.cap > e.flow) {
+          vis[e.to] = 1;
+          d[e.to] = d[x] + 1;
+          queue[qr ++] = e.to;
+        }
+      }
+    }
+    return vis[t];
+  }
+
+  // Pushes up to `a` units from x towards t along arcs that go exactly one
+  // BFS level deeper, and returns the amount actually pushed.
+  // Note: the scan starts at cur[x], but cur[x] itself is never advanced here.
+  int DFS(int x, int a) {
+    if (x == t || a == 0) return a;
+    int flow = 0, f;
+    for (int i = cur[x]; i != -1; i = nxt[i]) {
+      Edge& e = edges[i];
+      // The level test short-circuits, so DFS only recurses on level arcs.
+      if (d[x] + 1 == d[e.to] && (f = DFS(e.to, min(a, e.cap-e.flow))) > 0) {
+        e.flow += f;
+        edges[i^1].flow -= f;  // give the same amount back on the reverse arc
+        flow += f;
+        a -= f;
+        if (a == 0) break;
+      }
+    }
+    return flow;
+  }
+
+  // Returns the s-t flow accumulated by alternating BFS() levelling with one
+  // DFS() push of up to INF units, until t is no longer reachable.
+  int Maxflow(int s, int t) {
+    this -> s = s; this -> t = t;
+    int flow = 0;
+    while (BFS()) {
+      for (int i = 0; i < n; i++)
+        cur[i] = head[i];
+      flow += DFS(s, INF);
+    }
+    return flow;
+  }
+};
+
+
+extern "C" {
+
+
+static Dinic *G;
+static int ans;
+
+// Builds the benchmark graph: vertices 0..N-1 and N..2N-1 form two sides with
+// a random-capacity arc (0..9) for every pair, vertex 2N feeds the first side
+// and the second side drains into vertex 2N+1 (capacities 0..999).
+// Zero capacities are dropped by AddEdge.
+void bench_dinic_prepare() {
+  N = setting->size;
+  bench_srand(1);
+  const int source = 2 * N;
+  const int sink = source + 1;
+  G = (Dinic*)bench_alloc(sizeof(Dinic));
+  G->init(sink + 1);
+  for (int left = 0; left < N; left ++) {
+    for (int right = 0; right < N; right ++) {
+      G->AddEdge(left, N + right, bench_rand() % 10);
+    }
+  }
+
+  int v = 0;
+  while (v < N) {
+    G->AddEdge(source, v, bench_rand() % 1000);
+    G->AddEdge(N + v, sink, bench_rand() % 1000);
+    v ++;
+  }
+}
+
+// Benchmark body: max flow from vertex 2N to vertex 2N+1, stored in `ans`.
+void bench_dinic_run() {
+  const int source = 2 * N;
+  const int sink = source + 1;
+  ans = G->Maxflow(source, sink);
+}
+
+// Returns non-zero when the computed flow equals the expected checksum.
+int bench_dinic_validate() {
+  const uint32_t got = (uint32_t)ans;
+  return got == setting->checksum;
+}
+}
+
+
--- a/microbench/src/fib/fib.c
+++ b/microbench/src/fib/fib.c
@ -0,0 +1,64 @@
+#include <benchmark.h>
+
+// f(n) = (f(n-1) + f(n-2) + .. f(n-m)) mod 2^32
+
+#define N 2147483603
+static int M;
+
+// Stores `data` at row i, column j of the row-major M x M matrix m.
+static void put(uint32_t *m, int i, int j, uint32_t data) {
+  uint32_t *cell = m + (i * M + j);
+  *cell = data;
+}
+
+// Loads the element at row i, column j of the row-major M x M matrix m.
+static uint32_t get(uint32_t *m, int i, int j) {
+  const uint32_t *cell = m + (i * M + j);
+  return *cell;
+}
+
+// c = a * b for M x M matrices, with uint32_t wrap-around arithmetic.
+// Each partial sum is written back to c as it is formed.
+static inline void mult(uint32_t *c, uint32_t *a, uint32_t *b) {
+  for (int row = 0; row < M; row ++) {
+    for (int col = 0; col < M; col ++) {
+      put(c, row, col, 0);
+      for (int k = 0; k < M; k ++) {
+        uint32_t term = get(a, row, k) * get(b, k, col);
+        put(c, row, col, get(c, row, col) + term);
+      }
+    }
+  }
+}
+
+// Copies every element of the M x M matrix b into a, row by row.
+static inline void assign(uint32_t *a, uint32_t *b) {
+  int row = 0;
+  while (row < M) {
+    int col = 0;
+    while (col < M) {
+      put(a, row, col, get(b, row, col));
+      col ++;
+    }
+    row ++;
+  }
+}
+
+static uint32_t *A, *ans, *T, *tmp;
+
+// Reads the matrix dimension M from the setting and allocates the four
+// M x M working matrices (allocation order: A, T, ans, tmp).
+void bench_fib_prepare() {
+  M = setting->size;
+  const int matrix_bytes = sizeof(uint32_t) * M * M;
+  A = bench_alloc(matrix_bytes);
+  T = bench_alloc(matrix_bytes);
+  ans = bench_alloc(matrix_bytes);
+  tmp = bench_alloc(matrix_bytes);
+}
+
+// Benchmark body: fills A and T with the transition matrix (ones on the
+// superdiagonal and across the last row), sets ans to the identity, then
+// raises T to the power N by square-and-multiply so that ans = T^N.
+void bench_fib_run() {
+  for (int i = 0; i < M; i ++) {
+    for (int j = 0; j < M; j ++) {
+      uint32_t step = 0;
+      if (i == M - 1 || j == i + 1) {
+        step = 1;
+      }
+      put(A, i, j, step);
+      put(T, i, j, step);
+      put(ans, i, j, i == j ? 1 : 0);
+    }
+  }
+
+  int exponent = N;
+  while (exponent > 0) {
+    if (exponent & 1) {
+      // Low bit set: fold the current power of T into the result.
+      mult(tmp, ans, T);
+      assign(ans, tmp);
+    }
+    mult(tmp, T, T);
+    assign(T, tmp);
+    exponent >>= 1;
+  }
+}
+
+// Returns non-zero when the bottom-right element of `ans` equals the
+// expected checksum.
+int bench_fib_validate() {
+  const uint32_t corner = get(ans, M-1, M-1);
+  return corner == setting->checksum;
+}
--- a/microbench/src/lzip/lzip.c
+++ b/microbench/src/lzip/lzip.c
@ -0,0 +1,29 @@
+#include "quicklz.h"
+#include <benchmark.h>
+
+static int SIZE;
+
+static qlz_state_compress *state;
+static char *blk;
+static char *compress;
+static int len;
+
+// Allocates the compressor state, a SIZE-byte input block and a SIZE + 400
+// byte output buffer, then fills the input with pseudo-random lowercase
+// letters (seed 1).
+void bench_lzip_prepare() {
+  SIZE = setting->size;
+  bench_srand(1);
+  state = bench_alloc(sizeof(qlz_state_compress));
+  blk = bench_alloc(SIZE);
+  compress = bench_alloc(SIZE + 400);
+  int pos = 0;
+  while (pos < SIZE) {
+    blk[pos] = 'a' + bench_rand() % 26;
+    pos ++;
+  }
+}
+
+// Benchmark body: compresses blk into `compress` and records the packet length.
+void bench_lzip_run() {
+  size_t packet_len = qlz_compress(blk, compress, SIZE, state);
+  len = (int)packet_len;
+}
+
+// Returns non-zero when the checksum over the compressed packet matches the
+// expected value.
+int bench_lzip_validate() {
+  char *packet_end = compress + len;
+  return checksum(compress, packet_end) == setting->checksum;
+}
+
--- a/microbench/src/lzip/quicklz.c
+++ b/microbench/src/lzip/quicklz.c
@ -0,0 +1,761 @@
+// Fast data compression library
+// Copyright (C) 2006-2011 Lasse Mikkel Reinhold
+// lar@quicklz.com
+//
+// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
+// released into public must be open source) or under a commercial license if such
+// has been acquired (see http://www.quicklz.com/order.html). The commercial license
+// does not cover derived or ported versions created by third parties under GPL.
+
+// 1.5.0 final
+
+#include "quicklz.h"
+
+#if QLZ_VERSION_MAJOR != 1 || QLZ_VERSION_MINOR != 5 || QLZ_VERSION_REVISION != 0
+	#error quicklz.c and quicklz.h have different versions
+#endif
+
+#define MINOFFSET 2
+#define UNCONDITIONAL_MATCHLEN 6
+#define UNCOMPRESSED_END 4
+#define CWORD_LEN 4
+
+#if QLZ_COMPRESSION_LEVEL == 1 && defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
+	#define OFFSET_BASE source
+	#define CAST (ui32)(size_t)
+#else
+	#define OFFSET_BASE 0
+	#define CAST
+#endif
+
+// Reports one compile-time property of this build, selected by `setting`:
+// 0 compression level, 1/2 sizeof the compress/decompress state,
+// 3 streaming buffer size, 6 whether QLZ_MEMORY_SAFE is defined,
+// 7/8/9 version major/minor/revision. Any other selector yields -1.
+int qlz_get_setting(int setting)
+{
+	if (setting == 0)
+		return QLZ_COMPRESSION_LEVEL;
+	if (setting == 1)
+		return sizeof(qlz_state_compress);
+	if (setting == 2)
+		return sizeof(qlz_state_decompress);
+	if (setting == 3)
+		return QLZ_STREAMING_BUFFER;
+	if (setting == 6)
+	{
+#ifdef QLZ_MEMORY_SAFE
+		return 1;
+#else
+		return 0;
+#endif
+	}
+	if (setting == 7)
+		return QLZ_VERSION_MAJOR;
+	if (setting == 8)
+		return QLZ_VERSION_MINOR;
+	if (setting == 9)
+		return QLZ_VERSION_REVISION;
+	return -1;
+}
+
+#if QLZ_COMPRESSION_LEVEL == 1
+// Returns 1 when src[1..n] all equal src[0] (trivially for n == 0), else 0.
+static int same(const unsigned char *src, size_t n)
+{
+	for (; n > 0; n--)
+	{
+		if (src[n] != src[0])
+			return 0;
+	}
+	return 1;
+}
+#endif
+
+// Clears the compressor hash table: the stored offset of every slot at
+// level 1, the per-slot use counter at the other levels.
+static void reset_table_compress(qlz_state_compress *state)
+{
+	int slot = 0;
+	while (slot < QLZ_HASH_VALUES)
+	{
+#if QLZ_COMPRESSION_LEVEL == 1
+		state->hash[slot].offset = 0;
+#else
+		state->hash_counter[slot] = 0;
+#endif
+		slot++;
+	}
+}
+
+// Zeroes the decompressor's per-slot counters at level 2; does nothing at
+// the other levels.
+static void reset_table_decompress(qlz_state_decompress *state)
+{
+#if QLZ_COMPRESSION_LEVEL == 2
+	int slot;
+	for (slot = 0; slot != QLZ_HASH_VALUES; slot++)
+		state->hash_counter[slot] = 0;
+#else
+	(void)state;
+#endif
+}
+
+// Folds a fetched value into a hash-table index in [0, QLZ_HASH_VALUES).
+static __inline ui32 hash_func(ui32 i)
+{
+	const ui32 mask = QLZ_HASH_VALUES - 1;
+#if QLZ_COMPRESSION_LEVEL == 2
+	ui32 mixed = (i >> 9) ^ (i >> 13) ^ i;
+#else
+	ui32 mixed = (i >> 12) ^ i;
+#endif
+	return mixed & mask;
+}
+
+// Assembles the first `bytes` bytes at src into a little-endian value
+// (src[0] is the least significant byte). Only 1..4 bytes are supported;
+// any other count yields 0 without touching src.
+static __inline ui32 fast_read(void const *src, ui32 bytes)
+{
+	const unsigned char *p = (const unsigned char *)src;
+	ui32 ret = 0;
+	ui32 i;
+
+	if (bytes < 1 || bytes > 4)
+		return 0;
+
+	for (i = 0; i < bytes; i++)
+	{
+		// Widen before shifting: a byte >= 0x80 promoted to int and shifted
+		// left by 24 would overflow a signed int.
+		ret |= (ui32)p[i] << (i * 8);
+	}
+	return ret;
+}
+
+// Hash-table index for the three bytes starting at src.
+static __inline ui32 hashat(const unsigned char *src)
+{
+	return hash_func(fast_read(src, 3));
+}
+
+// Stores the low `bytes` bytes of f at dst, least significant byte first.
+// This is the byte order fast_read() decodes, independent of the host's
+// endianness. Bytes beyond the fourth are written as zero.
+static __inline void fast_write(ui32 f, void *dst, size_t bytes)
+{
+	unsigned char *out = (unsigned char *)dst;
+	size_t i;
+
+	for (i = 0; i != bytes; i++)
+	{
+		out[i] = (unsigned char)(f & 0xff);
+		f >>= 8;
+	}
+}
+
+
+// Reads the decompressed-size field from a packet header. Bit 1 of the first
+// byte selects 4-byte size fields (set) or 1-byte fields (clear); the
+// decompressed size is the second field.
+size_t qlz_size_decompressed(const char *source)
+{
+	ui32 width = ((*source & 2) != 0) ? 4 : 1;
+	ui32 raw = fast_read(source + 1 + width, width);
+	ui32 mask = 0xffffffff >> ((4 - width)*8);
+	return raw & mask;
+}
+
+// Reads the compressed-size field from a packet header. Bit 1 of the first
+// byte selects 4-byte size fields (set) or 1-byte fields (clear); the
+// compressed size is the first field, right after the flag byte.
+size_t qlz_size_compressed(const char *source)
+{
+	ui32 width = ((*source & 2) != 0) ? 4 : 1;
+	ui32 raw = fast_read(source + 1, width);
+	ui32 mask = 0xffffffff >> ((4 - width)*8);
+	return raw & mask;
+}
+
+// Header length in bytes: one flag byte plus two size fields of 4 bytes each
+// (bit 1 of the flag byte set) or 1 byte each (clear).
+size_t qlz_size_header(const char *source)
+{
+	if ((*source & 2) == 2)
+		return 9;
+	return 3;
+}
+
+
+// Copies n bytes from src to dst strictly front to back, one byte at a time.
+// The ascending order matters: when dst lies shortly after src the bytes
+// written earlier are read again later, which is how an overlapping match
+// repeats its pattern. Byte-wise access also avoids any unaligned word load
+// or store.
+static __inline void memcpy_up(unsigned char *dst, const unsigned char *src, ui32 n)
+{
+	unsigned char *end = dst + n;
+	while (dst < end)
+	{
+		*dst = *src;
+		dst++;
+		src++;
+	}
+}
+
+// Records position s in the decompressor's hash table under the hash of the
+// three bytes at s. Level 1 keeps a single pointer per slot; level 2 keeps a
+// ring of QLZ_POINTERS pointers indexed by the slot's running counter.
+// At level 3 the function does nothing.
+static __inline void update_hash(qlz_state_decompress *state, const unsigned char *s)
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+	ui32 hash;
+	hash = hashat(s);
+	state->hash[hash].offset = s;
+	state->hash_counter[hash] = 1;
+#elif QLZ_COMPRESSION_LEVEL == 2
+	ui32 hash;
+	unsigned char c;
+	hash = hashat(s);
+	c = state->hash_counter[hash];
+	// The counter modulo QLZ_POINTERS picks the ring entry to overwrite.
+	state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = s;
+	c++;
+	state->hash_counter[hash] = c;
+#endif
+	// Silence unused-parameter warnings when neither branch above is compiled.
+	(void)state;
+	(void)s;
+}
+
+#if QLZ_COMPRESSION_LEVEL <= 2
+// Advances *lh one byte at a time until it reaches max, hashing every
+// position it lands on. *lh is left at the last hashed position.
+static void update_hash_upto(qlz_state_decompress *state, unsigned char **lh, const unsigned char *max)
+{
+	unsigned char *cursor = *lh;
+	while (cursor < max)
+	{
+		cursor++;
+		update_hash(state, cursor);
+	}
+	*lh = cursor;
+}
+#endif
+
+// Compresses `size` bytes at `source` into `destination` (payload only, no
+// packet header) using the hash table in `state`.
+//
+// The output is a sequence of groups, each led by a CWORD_LEN-byte control
+// word whose bits flag the following items: a set bit marks a match
+// reference, a clear bit marks one literal byte. The control word is written
+// once its marker bit has been shifted down to bit 0.
+//
+// Returns the number of bytes written (never less than 9), or 0 when, past
+// the midpoint of the input, the output is not sufficiently smaller than the
+// input consumed so far (the caller then stores the data uncompressed).
+static size_t qlz_compress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_compress *state)
+{
+	const unsigned char *last_byte = source + size - 1;
+	const unsigned char *src = source;
+	unsigned char *cword_ptr = destination;
+	unsigned char *dst = destination + CWORD_LEN;
+	ui32 cword_val = 1U << 31;
+	// Matching stops this far before the end; the tail is emitted as literals.
+	const unsigned char *last_matchstart = last_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END;
+	ui32 fetch = 0;
+	unsigned int lits = 0;
+
+	(void) lits;
+
+	if(src <= last_matchstart)
+		fetch = fast_read(src, 3);
+
+	while(src <= last_matchstart)
+	{
+		// Marker bit reached bit 0: the current control word is full.
+		if ((cword_val & 1) == 1)
+		{
+			// store uncompressed if compression ratio is too low
+			if (src > source + (size >> 1) && dst - destination > src - source - ((src - source) >> 5))
+				return 0;
+
+			fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
+
+			cword_ptr = dst;
+			dst += CWORD_LEN;
+			cword_val = 1U << 31;
+			fetch = fast_read(src, 3);
+		}
+#if QLZ_COMPRESSION_LEVEL == 1
+		{
+			const unsigned char *o;
+			ui32 hash, cached;
+
+			hash = hash_func(fetch);
+			cached = fetch ^ state->hash[hash].cache;
+			state->hash[hash].cache = fetch;
+
+			o = state->hash[hash].offset + OFFSET_BASE;
+			state->hash[hash].offset = CAST(src - OFFSET_BASE);
+
+			if (cached == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6))))
+			{
+				if (*(o + 3) != *(src + 3))
+				{
+					hash <<= 4;
+					cword_val = (cword_val >> 1) | (1U << 31);
+					fast_write((3 - 2) | hash, dst, 2);
+					src += 3;
+					dst += 2;
+				}
+				else
+				{
+					const unsigned char *old_src = src;
+					size_t matchlen;
+					hash <<= 4;
+
+					cword_val = (cword_val >> 1) | (1U << 31);
+					src += 4;
+
+					if(*(o + (src - old_src)) == *src)
+					{
+						src++;
+						if(*(o + (src - old_src)) == *src)
+						{
+							size_t q = last_byte - UNCOMPRESSED_END - (src - 5) + 1;
+							size_t remaining = q > 255 ? 255 : q;
+							src++;
+							while(*(o + (src - old_src)) == *src && (size_t)(src - old_src) < remaining)
+								src++;
+						}
+					}
+
+					matchlen = src - old_src;
+					if (matchlen < 18)
+					{
+						fast_write((ui32)(matchlen - 2) | hash, dst, 2);
+						dst += 2;
+					}
+					else
+					{
+						fast_write((ui32)(matchlen << 16) | hash, dst, 3);
+						dst += 3;
+					}
+				}
+				fetch = fast_read(src, 3);
+				lits = 0;
+			}
+			else
+			{
+				lits++;
+				*dst = *src;
+				src++;
+				dst++;
+				cword_val = (cword_val >> 1);
+				fetch = (fetch >> 8 & 0xffff) | (*(src + 2) << 16);
+			}
+		}
+#elif QLZ_COMPRESSION_LEVEL >= 2
+		{
+			const unsigned char *o, *offset2;
+			ui32 hash, matchlen, k, m, best_k = 0;
+			unsigned char c;
+			size_t remaining = (last_byte - UNCOMPRESSED_END - src + 1) > 255 ? 255 : (last_byte - UNCOMPRESSED_END - src + 1);
+			(void)best_k;
+
+
+			//hash = hashat(src);
+			fetch = fast_read(src, 3);
+			hash = hash_func(fetch);
+
+			c = state->hash_counter[hash];
+
+			// Candidate 0 seeds the best match; the loop below tries the others.
+			offset2 = state->hash[hash].offset[0];
+			if(offset2 < src - MINOFFSET && c > 0 && ((fast_read(offset2, 3) ^ fetch) & 0xffffff) == 0)
+			{
+				matchlen = 3;
+				if(*(offset2 + matchlen) == *(src + matchlen))
+				{
+					matchlen = 4;
+					while(*(offset2 + matchlen) == *(src + matchlen) && matchlen < remaining)
+						matchlen++;
+				}
+			}
+			else
+				matchlen = 0;
+			for(k = 1; k < QLZ_POINTERS && c > k; k++)
+			{
+				o = state->hash[hash].offset[k];
+#if QLZ_COMPRESSION_LEVEL == 3
+				if(((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET)
+#elif QLZ_COMPRESSION_LEVEL == 2
+				if(*(src + matchlen) == *(o + matchlen)	&& ((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET)
+#endif
+				{
+					m = 3;
+					while(*(o + m) == *(src + m) && m < remaining)
+						m++;
+#if QLZ_COMPRESSION_LEVEL == 3
+					if ((m > matchlen) || (m == matchlen && o > offset2))
+#elif QLZ_COMPRESSION_LEVEL == 2
+					if (m > matchlen)
+#endif
+					{
+						offset2 = o;
+						matchlen = m;
+						best_k = k;
+					}
+				}
+			}
+			o = offset2;
+			// Remember the current position in this slot's ring of candidates.
+			state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src;
+			c++;
+			state->hash_counter[hash] = c;
+
+#if QLZ_COMPRESSION_LEVEL == 3
+			if(matchlen > 2 && src - o < 131071)
+			{
+				ui32 u;
+				size_t offset = src - o;
+
+				for(u = 1; u < matchlen; u++)
+				{
+					hash = hashat(src + u);
+					c = state->hash_counter[hash]++;
+					state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src + u;
+				}
+
+				cword_val = (cword_val >> 1) | (1U << 31);
+				src += matchlen;
+
+				if(matchlen == 3 && offset <= 63)
+				{
+					*dst = (unsigned char)(offset << 2);
+					dst++;
+				}
+				else if (matchlen == 3 && offset <= 16383)
+				{
+					ui32 f = (ui32)((offset << 2) | 1);
+					fast_write(f, dst, 2);
+					dst += 2;
+				}
+				else if (matchlen <= 18 && offset <= 1023)
+				{
+					ui32 f = ((matchlen - 3) << 2) | ((ui32)offset << 6) | 2;
+					fast_write(f, dst, 2);
+					dst += 2;
+				}
+
+				else if(matchlen <= 33)
+				{
+					ui32 f = ((matchlen - 2) << 2) | ((ui32)offset << 7) | 3;
+					fast_write(f, dst, 3);
+					dst += 3;
+				}
+				else
+				{
+					ui32 f = ((matchlen - 3) << 7) | ((ui32)offset << 15) | 3;
+					fast_write(f, dst, 4);
+					dst += 4;
+				}
+			}
+			else
+			{
+				*dst = *src;
+				src++;
+				dst++;
+				cword_val = (cword_val >> 1);
+			}
+#elif QLZ_COMPRESSION_LEVEL == 2
+
+			if(matchlen > 2)
+			{
+				cword_val = (cword_val >> 1) | (1U << 31);
+				src += matchlen;
+
+				if (matchlen < 10)
+				{
+					ui32 f = best_k | ((matchlen - 2) << 2) | (hash << 5);
+					fast_write(f, dst, 2);
+					dst += 2;
+				}
+				else
+				{
+					ui32 f = best_k | (matchlen << 16) | (hash << 5);
+					fast_write(f, dst, 3);
+					dst += 3;
+				}
+			}
+			else
+			{
+				*dst = *src;
+				src++;
+				dst++;
+				cword_val = (cword_val >> 1);
+			}
+#endif
+		}
+#endif
+	}
+	// Tail: everything after last_matchstart is emitted as literal bytes.
+	while (src <= last_byte)
+	{
+		if ((cword_val & 1) == 1)
+		{
+			fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
+			cword_ptr = dst;
+			dst += CWORD_LEN;
+			cword_val = 1U << 31;
+		}
+#if QLZ_COMPRESSION_LEVEL < 3
+		if (src <= last_byte - 3)
+		{
+#if QLZ_COMPRESSION_LEVEL == 1
+			ui32 hash, fetch;
+			fetch = fast_read(src, 3);
+			hash = hash_func(fetch);
+			state->hash[hash].offset = CAST(src - OFFSET_BASE);
+			state->hash[hash].cache = fetch;
+#elif QLZ_COMPRESSION_LEVEL == 2
+			ui32 hash;
+			unsigned char c;
+			hash = hashat(src);
+			c = state->hash_counter[hash];
+			state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src;
+			c++;
+			state->hash_counter[hash] = c;
+#endif
+		}
+#endif
+		*dst = *src;
+		src++;
+		dst++;
+		cword_val = (cword_val >> 1);
+	}
+
+	// Shift the partially filled control word down until its marker bit
+	// sits at bit 0, then store it.
+	while((cword_val & 1) != 1)
+		cword_val = (cword_val >> 1);
+
+	fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
+
+	// min. size must be 9 bytes so that the qlz_size functions can take 9 bytes as argument
+	return dst - destination < 9 ? 9 : dst - destination;
+}
+
+// Decodes the compressed payload of the packet at `source` into
+// `destination`, producing `size` bytes. `history` is the lowest address a
+// match may refer back to; it is only consulted when QLZ_MEMORY_SAFE is
+// defined.
+//
+// The payload is read as control words followed by their items: a set bit
+// means a match reference (copied via memcpy_up), a clear bit means literal
+// bytes.
+//
+// Returns `size` on completion, or 0 when a QLZ_MEMORY_SAFE bounds check
+// rejects the input.
+static size_t qlz_decompress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_decompress *state, const unsigned char *history)
+{
+	const unsigned char *src = source + qlz_size_header((const char *)source);
+	unsigned char *dst = destination;
+	const unsigned char *last_destination_byte = destination + size - 1;
+	ui32 cword_val = 1;
+	const unsigned char *last_matchstart = last_destination_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END;
+	unsigned char *last_hashed = destination - 1;
+	const unsigned char *last_source_byte = source + qlz_size_compressed((const char *)source) - 1;
+	// bitlut[x] = number of trailing zero bits in the 4-bit value x (4 for 0),
+	// i.e. how many consecutive literals the control word announces next.
+	static const ui32 bitlut[16] = {4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
+
+	(void) last_source_byte;
+	(void) last_hashed;
+	(void) state;
+	(void) history;
+
+	for(;;)
+	{
+		ui32 fetch;
+
+		// Only the marker bit is left: load the next control word.
+		if (cword_val == 1)
+		{
+#ifdef QLZ_MEMORY_SAFE
+			if(src + CWORD_LEN - 1 > last_source_byte)
+				return 0;
+#endif
+			cword_val = fast_read(src, CWORD_LEN);
+			src += CWORD_LEN;
+		}
+
+#ifdef QLZ_MEMORY_SAFE
+			if(src + 4 - 1 > last_source_byte)
+				return 0;
+#endif
+
+		fetch = fast_read(src, 4);
+
+		if ((cword_val & 1) == 1)
+		{
+			ui32 matchlen;
+			const unsigned char *offset2;
+
+#if QLZ_COMPRESSION_LEVEL == 1
+			ui32 hash;
+			cword_val = cword_val >> 1;
+			hash = (fetch >> 4) & 0xfff;
+			offset2 = (const unsigned char *)(size_t)state->hash[hash].offset;
+
+			if((fetch & 0xf) != 0)
+			{
+				matchlen = (fetch & 0xf) + 2;
+				src += 2;
+			}
+			else
+			{
+				matchlen = *(src + 2);
+				src += 3;
+			}
+
+#elif QLZ_COMPRESSION_LEVEL == 2
+			ui32 hash;
+			unsigned char c;
+			cword_val = cword_val >> 1;
+			hash = (fetch >> 5) & 0x7ff;
+			c = (unsigned char)(fetch & 0x3);
+			offset2 = state->hash[hash].offset[c];
+
+			if((fetch & (28)) != 0)
+			{
+				matchlen = ((fetch >> 2) & 0x7) + 2;
+				src += 2;
+			}
+			else
+			{
+				matchlen = *(src + 2);
+				src += 3;
+			}
+
+#elif QLZ_COMPRESSION_LEVEL == 3
+			ui32 offset;
+			cword_val = cword_val >> 1;
+			if ((fetch & 3) == 0)
+			{
+				offset = (fetch & 0xff) >> 2;
+				matchlen = 3;
+				src++;
+			}
+			else if ((fetch & 2) == 0)
+			{
+				offset = (fetch & 0xffff) >> 2;
+				matchlen = 3;
+				src += 2;
+			}
+			else if ((fetch & 1) == 0)
+			{
+				offset = (fetch & 0xffff) >> 6;
+				matchlen = ((fetch >> 2) & 15) + 3;
+				src += 2;
+			}
+			else if ((fetch & 127) != 3)
+			{
+				offset = (fetch >> 7) & 0x1ffff;
+				matchlen = ((fetch >> 2) & 0x1f) + 2;
+				src += 3;
+			}
+			else
+			{
+				offset = (fetch >> 15);
+				matchlen = ((fetch >> 7) & 255) + 3;
+				src += 4;
+			}
+
+			offset2 = dst - offset;
+#endif
+
+#ifdef QLZ_MEMORY_SAFE
+			if(offset2 < history || offset2 > dst - MINOFFSET - 1)
+				return 0;
+
+			if(matchlen > (ui32)(last_destination_byte - dst - UNCOMPRESSED_END + 1))
+				return 0;
+#endif
+
+			memcpy_up(dst, offset2, matchlen);
+			dst += matchlen;
+
+#if QLZ_COMPRESSION_LEVEL <= 2
+			update_hash_upto(state, &last_hashed, dst - matchlen);
+			last_hashed = dst - 1;
+#endif
+		}
+		else
+		{
+			if (dst < last_matchstart)
+			{
+				// Copy four bytes unconditionally but advance only by the
+				// number of literals the control word announces.
+				unsigned int n = bitlut[cword_val & 0xf];
+				memcpy_up(dst, src, 4);
+				cword_val = cword_val >> n;
+				dst += n;
+				src += n;
+#if QLZ_COMPRESSION_LEVEL <= 2
+				update_hash_upto(state, &last_hashed, dst - 3);
+#endif
+			}
+			else
+			{
+				// Near the end of the output: copy the rest byte by byte,
+				// skipping over any control words in between.
+				while(dst <= last_destination_byte)
+				{
+					if (cword_val == 1)
+					{
+						src += CWORD_LEN;
+						cword_val = 1U << 31;
+					}
+#ifdef QLZ_MEMORY_SAFE
+					if(src >= last_source_byte + 1)
+						return 0;
+#endif
+					*dst = *src;
+					dst++;
+					src++;
+					cword_val = cword_val >> 1;
+				}
+
+#if QLZ_COMPRESSION_LEVEL <= 2
+				update_hash_upto(state, &last_hashed, last_destination_byte - 3); // todo, use constant
+#endif
+				return size;
+			}
+
+		}
+	}
+}
+
+// Compresses `size` bytes at `source` into a self-describing packet at
+// `destination` and returns the packet length in bytes.
+//
+// Returns 0 without writing anything when size is 0 or larger than
+// 0xffffffff - 400. Inputs shorter than 216 bytes get a 3-byte header
+// (flags, packet length, input length as single bytes); larger inputs get a
+// 9-byte header with 4-byte length fields. When qlz_compress_core() reports
+// that the data does not compress (returns 0), the input is stored verbatim
+// after the header and the "compressed" flag bit stays clear.
+size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state)
+{
+	size_t r;
+	ui32 compressed;
+	size_t base;  // header length in bytes
+
+	if(size == 0 || size > 0xffffffff - 400)
+		return 0;
+
+	if(size < 216)
+		base = 3;
+	else
+		base = 9;
+
+#if QLZ_STREAMING_BUFFER > 0
+	if (state->stream_counter + size - 1 >= QLZ_STREAMING_BUFFER)
+#endif
+	{
+		reset_table_compress(state);
+		r = base + qlz_compress_core((const unsigned char *)source, (unsigned char*)destination + base, size, state);
+#if QLZ_STREAMING_BUFFER > 0
+		reset_table_compress(state);
+#endif
+		// r == base means the core returned 0: fall back to a raw copy.
+		if(r == base)
+		{
+			bench_memcpy(destination + base, source, size);
+			r = size + base;
+			compressed = 0;
+		}
+		else
+		{
+			compressed = 1;
+		}
+		state->stream_counter = 0;
+	}
+#if QLZ_STREAMING_BUFFER > 0
+	else
+	{
+		unsigned char *src = state->stream_buffer + state->stream_counter;
+
+		bench_memcpy(src, source, size);
+		r = base + qlz_compress_core(src, (unsigned char*)destination + base, size, state);
+
+ 		if(r == base)
+		{
+			bench_memcpy(destination + base, src, size);
+			r = size + base;
+			compressed = 0;
+			reset_table_compress(state);
+		}
+		else
+		{
+			compressed = 1;
+		}
+		state->stream_counter += size;
+	}
+#endif
+	if(base == 3)
+	{
+		*destination = (unsigned char)(0 | compressed);
+		*(destination + 1) = (unsigned char)r;
+		*(destination + 2) = (unsigned char)size;
+	}
+	else
+	{
+		*destination = (unsigned char)(2 | compressed);
+		fast_write((ui32)r, destination + 1, 4);
+		fast_write((ui32)size, destination + 5, 4);
+	}
+
+	// Remaining flag bits: compression level (bits 2-3), streaming buffer
+	// class (bits 4-5) and a constant 1 in bit 6.
+	*destination |= (QLZ_COMPRESSION_LEVEL << 2);
+	*destination |= (1 << 6);
+	*destination |= ((QLZ_STREAMING_BUFFER == 0 ? 0 : (QLZ_STREAMING_BUFFER == 100000 ? 1 : (QLZ_STREAMING_BUFFER == 1000000 ? 2 : 3))) << 4);
+
+// 76543210
+// 01SSLLHC
+
+	return r;
+}
+
+// Expands the packet at `source` into `destination` and returns the number
+// of bytes produced. Bit 0 of the first header byte tells whether the payload
+// is compressed (decoded by qlz_decompress_core()) or stored verbatim
+// (copied). With QLZ_STREAMING_BUFFER > 0 the data is first decoded into the
+// state's stream buffer while it still fits, then copied out.
+size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state)
+{
+	size_t dsiz = qlz_size_decompressed(source);
+
+#if QLZ_STREAMING_BUFFER > 0
+	if (state->stream_counter + qlz_size_decompressed(source) - 1 >= QLZ_STREAMING_BUFFER)
+#endif
+	{
+		if((*source & 1) == 1)
+		{
+			reset_table_decompress(state);
+			dsiz = qlz_decompress_core((const unsigned char *)source, (unsigned char *)destination, dsiz, state, (const unsigned char *)destination);
+		}
+		else
+		{
+			bench_memcpy(destination, source + qlz_size_header(source), dsiz);
+		}
+		state->stream_counter = 0;
+		reset_table_decompress(state);
+	}
+#if QLZ_STREAMING_BUFFER > 0
+	else
+	{
+		unsigned char *dst = state->stream_buffer + state->stream_counter;
+		if((*source & 1) == 1)
+		{
+			dsiz = qlz_decompress_core((const unsigned char *)source, dst, dsiz, state, (const unsigned char *)state->stream_buffer);
+		}
+		else
+		{
+			bench_memcpy(dst, source + qlz_size_header(source), dsiz);
+			reset_table_decompress(state);
+		}
+		bench_memcpy(destination, dst, dsiz);
+		state->stream_counter += dsiz;
+	}
+#endif
+	return dsiz;
+}
+
--- a/microbench/src/lzip/quicklz.h
+++ b/microbench/src/lzip/quicklz.h
@ -0,0 +1,164 @@
+#ifndef QLZ_HEADER
+#define QLZ_HEADER
+
+#include <am.h>
+#include <klib.h>
+
+// Copies n bytes from src to dst and returns dst. Both pointers must be
+// non-NULL. When dst starts inside [src, src + n) the copy runs from the
+// last byte to the first so the overlapping source bytes are read before
+// they are overwritten; otherwise it runs front to back.
+static inline void* bench_memcpy(void* dst, const void* src, size_t n){
+  assert(dst&&src);
+  // Work on char pointers: arithmetic and ordering on void* is a compiler
+  // extension, not ISO C.
+  const char* s=(const char*)src;
+  char* d=(char*)dst;
+  if(s+n>d&&s<d){
+    s+=n;
+    d+=n;
+    while(n-->0)*--d=*--s;
+  }
+  else{
+    while(n-->0)*d++=*s++;
+  }
+  return dst;
+}
+
+
+// Fast data compression library
+// Copyright (C) 2006-2011 Lasse Mikkel Reinhold
+// lar@quicklz.com
+//
+// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
+// released into public must be open source) or under a commercial license if such
+// has been acquired (see http://www.quicklz.com/order.html). The commercial license
+// does not cover derived or ported versions created by third parties under GPL.
+
+// You can edit following user settings. Data must be decompressed with the same
+// setting of QLZ_COMPRESSION_LEVEL and QLZ_STREAMING_BUFFER as it was compressed
+// (see manual). If QLZ_STREAMING_BUFFER > 0, scratch buffers must be initially
+// zeroed out (see manual). First #ifndef makes it possible to define settings from
+// the outside like the compiler command line.
+
+// 1.5.0 final
+
+#ifndef QLZ_COMPRESSION_LEVEL
+
+	// 1 gives fastest compression speed. 3 gives fastest decompression speed and best
+	// compression ratio.
+	//#define QLZ_COMPRESSION_LEVEL 1
+	//#define QLZ_COMPRESSION_LEVEL 2
+	//#define QLZ_COMPRESSION_LEVEL 3
+	#define QLZ_COMPRESSION_LEVEL 2
+
+	// If > 0, zero out both states prior to first call to qlz_compress() or qlz_decompress()
+	// and decompress packets in the same order as they were compressed
+	#define QLZ_STREAMING_BUFFER 0
+	//#define QLZ_STREAMING_BUFFER 100000
+	//#define QLZ_STREAMING_BUFFER 1000000
+
+	// Guarantees that decompression of corrupted data cannot crash. Decreases decompression
+	// speed 10-20%. Compression speed not affected.
+	//#define QLZ_MEMORY_SAFE
+#endif
+
+#define QLZ_VERSION_MAJOR 1
+#define QLZ_VERSION_MINOR 5
+#define QLZ_VERSION_REVISION 0
+
+// Verify compression level
+#if QLZ_COMPRESSION_LEVEL != 1 && QLZ_COMPRESSION_LEVEL != 2 && QLZ_COMPRESSION_LEVEL != 3
+#error QLZ_COMPRESSION_LEVEL must be 1, 2 or 3
+#endif
+
+typedef unsigned int ui32;
+typedef unsigned short int ui16;
+
+// Decrease QLZ_POINTERS for level 3 to increase compression speed. Do not touch any other values!
+#if QLZ_COMPRESSION_LEVEL == 1
+#define QLZ_POINTERS 1
+#define QLZ_HASH_VALUES 4096
+#elif QLZ_COMPRESSION_LEVEL == 2
+#define QLZ_POINTERS 4
+#define QLZ_HASH_VALUES 2048
+#elif QLZ_COMPRESSION_LEVEL == 3
+#define QLZ_POINTERS 16
+#define QLZ_HASH_VALUES 4096
+#endif
+
+// hash entry
+// One compressor hash-table slot.
+// Level 1: the last fetched value hashed here (`cache`) plus a single source
+// position, stored as a 32-bit offset when QLZ_PTR_64 is defined and
+// streaming is disabled, otherwise as a pointer.
+// Levels 2 and 3: QLZ_POINTERS candidate source positions.
+typedef struct
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+	ui32 cache;
+#if defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
+	unsigned int offset;
+#else
+	const unsigned char *offset;
+#endif
+#else
+	const unsigned char *offset[QLZ_POINTERS];
+#endif
+
+} qlz_hash_compress;
+
+// One decompressor hash-table slot: a single position at level 1, otherwise
+// QLZ_POINTERS positions.
+typedef struct
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+	const unsigned char *offset;
+#else
+	const unsigned char *offset[QLZ_POINTERS];
+#endif
+} qlz_hash_decompress;
+
+
+// states
+// Scratch state for qlz_compress(): the optional streaming history buffer,
+// the number of bytes currently held in it, the hash table and one 8-bit
+// use counter per hash slot.
+typedef struct
+{
+	#if QLZ_STREAMING_BUFFER > 0
+		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+	#endif
+	size_t stream_counter;
+	qlz_hash_compress hash[QLZ_HASH_VALUES];
+	unsigned char hash_counter[QLZ_HASH_VALUES];
+} qlz_state_compress;
+
+
+// Scratch state for qlz_decompress(). Levels 1 and 2 carry a hash table and
+// per-slot counters; level 3 only needs the stream bookkeeping.
+#if QLZ_COMPRESSION_LEVEL == 1 || QLZ_COMPRESSION_LEVEL == 2
+	typedef struct
+	{
+#if QLZ_STREAMING_BUFFER > 0
+		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+#endif
+		qlz_hash_decompress hash[QLZ_HASH_VALUES];
+		unsigned char hash_counter[QLZ_HASH_VALUES];
+		size_t stream_counter;
+	} qlz_state_decompress;
+#elif QLZ_COMPRESSION_LEVEL == 3
+	typedef struct
+	{
+#if QLZ_STREAMING_BUFFER > 0
+		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+#endif
+// Never true inside this level-3 branch, so `hash` is not compiled in.
+#if QLZ_COMPRESSION_LEVEL <= 2
+		qlz_hash_decompress hash[QLZ_HASH_VALUES];
+#endif
+		size_t stream_counter;
+	} qlz_state_decompress;
+#endif
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+// Public functions of QuickLZ
+size_t qlz_size_decompressed(const char *source);
+size_t qlz_size_compressed(const char *source);
+size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state);
+size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state);
+int qlz_get_setting(int setting);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
+
--- a/microbench/src/md5/md5.c
+++ b/microbench/src/md5/md5.c
@ -0,0 +1,159 @@
+/*
+ * Simple MD5 implementation (github.com/pod32g/md5)
+ *
+ */
+
+#include <benchmark.h>
+
+static int N;
+
+// k[i] is the integer part of abs(sin(i + 1)) * 2^32, with i + 1 taken in radians.
+const uint32_t k[64] = {
+0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee ,
+0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 ,
+0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be ,
+0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 ,
+0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa ,
+0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 ,
+0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed ,
+0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a ,
+0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c ,
+0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 ,
+0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 ,
+0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 ,
+0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 ,
+0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 ,
+0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 ,
+0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 };
+
+// r specifies the per-round shift amounts
+static const uint32_t r[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
+                 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20,
+                 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
+                 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
+
+// leftrotate function definition
+#define LEFTROTATE(x, c) (((x) << (c)) | ((x) >> (32 - (c))))
+
+// Writes val into bytes[0..3], least significant byte first.
+static void to_bytes(uint32_t val, uint8_t *bytes)
+{
+    for (int i = 0; i < 4; i++) {
+        bytes[i] = (uint8_t) (val >> (8 * i));
+    }
+}
+
+// Reads bytes[0..3] as a little-endian 32-bit value.
+static uint32_t to_int32(const uint8_t *bytes)
+{
+    uint32_t val = 0;
+    for (int i = 3; i >= 0; i--) {
+        val = (val << 8) | (uint32_t) bytes[i];
+    }
+    return val;
+}
+
+// Computes the MD5 digest of msg[0 .. initial_len) and stores its 16 bytes in
+// `digest` (little-endian words h0..h3).
+//
+// The padding is written in place, directly behind the message: one 0x80
+// byte, zeros up to the next length congruent to 56 mod 64, then the 64-bit
+// message length in bits. msg must therefore have room for that padded
+// length plus 8 bytes.
+static void md5(uint8_t *msg, size_t initial_len, uint8_t *digest) {
+
+    // Running hash state
+    uint32_t h0 = 0x67452301;
+    uint32_t h1 = 0xefcdab89;
+    uint32_t h2 = 0x98badcfe;
+    uint32_t h3 = 0x10325476;
+
+    uint32_t w[16];
+
+    // Smallest length above initial_len that is 56 modulo 64.
+    size_t new_len = initial_len + 1;
+    while (new_len % (512/8) != 448/8) {
+        new_len++;
+    }
+
+    // Padding: a single "1" bit (0x80), then "0" bits.
+    msg[initial_len] = 0x80;
+    for (size_t pos = initial_len + 1; pos < new_len; pos++) {
+        msg[pos] = 0;
+    }
+
+    // Message length in bits as two little-endian 32-bit halves.
+    to_bytes(initial_len*8, msg + new_len);
+    // initial_len>>29 == initial_len*8>>32, but avoids overflow.
+    to_bytes(initial_len>>29, msg + new_len + 4);
+
+    // One pass per 64-byte chunk.
+    for (size_t offset = 0; offset < new_len; offset += (512/8)) {
+        const uint8_t *chunk = msg + offset;
+
+        for (uint32_t j = 0; j < 16; j++) {
+            w[j] = to_int32(chunk + j*4);
+        }
+
+        uint32_t a = h0;
+        uint32_t b = h1;
+        uint32_t c = h2;
+        uint32_t d = h3;
+
+        for (uint32_t i = 0; i < 64; i++) {
+            uint32_t f, g;
+
+            // Four rounds of 16 steps, each with its own mixing function
+            // and message-word schedule.
+            switch (i / 16) {
+                case 0:
+                    f = (b & c) | ((~b) & d);
+                    g = i;
+                    break;
+                case 1:
+                    f = (d & b) | ((~d) & c);
+                    g = (5*i + 1) % 16;
+                    break;
+                case 2:
+                    f = b ^ c ^ d;
+                    g = (3*i + 5) % 16;
+                    break;
+                default:
+                    f = c ^ (b | (~d));
+                    g = (7*i) % 16;
+                    break;
+            }
+
+            uint32_t rotated = LEFTROTATE((a + f + k[i] + w[g]), r[i]);
+            a = d;
+            d = c;
+            c = b;
+            b = b + rotated;
+        }
+
+        // Fold this chunk into the running state.
+        h0 += a;
+        h1 += b;
+        h2 += c;
+        h3 += d;
+    }
+
+    to_bytes(h0, digest);
+    to_bytes(h1, digest + 4);
+    to_bytes(h2, digest + 8);
+    to_bytes(h3, digest + 12);
+}
+
+static uint8_t *str;
+static uint8_t *digest;
+
+// Allocates and fills the N-byte random message (seed 1) and the 16-byte
+// digest buffer.
+void bench_md5_prepare() {
+  N = setting->size;
+  bench_srand(1);
+  // md5() pads the message in place behind its last byte: a 0x80 byte plus
+  // zero fill (at most 64 bytes together), followed by an 8-byte bit count.
+  // Reserve that room so the padding stays inside this allocation.
+  str = bench_alloc(N + 64 + 8);
+  for (int i = 0; i < N; i ++) {
+    str[i] = bench_rand();
+  }
+  digest = bench_alloc(16);
+}
+
+// Benchmark body: hashes the N-byte message into `digest`.
+void bench_md5_run() {
+  const size_t message_len = N;
+  md5(str, message_len, digest);
+}
+
+// Returns non-zero when the checksum over the 16 digest bytes matches the
+// expected value.
+int bench_md5_validate() {
+  uint8_t *digest_end = digest + 16;
+  return checksum(digest, digest_end) == setting->checksum;
+}
--- a/microbench/src/qsort/qsort.c
+++ b/microbench/src/qsort/qsort.c
@ -0,0 +1,44 @@
+#include <benchmark.h>
+
+// Element count and the array that bench_qsort_run() sorts.
+static int N, *data;
+
+// Allocate N ints (N = setting->size) and fill them from bench_rand() after
+// seeding with 1. Each element is built from two draws: the first is shifted
+// left by 16 bits and OR-ed with the second.
+void bench_qsort_prepare() {
+  bench_srand(1);
+
+  N = setting->size;
+  data = bench_alloc(N * sizeof(int));
+
+  int *slot = data;
+  for (int left = N; left > 0; left --) {
+    int hi = bench_rand();
+    int lo = bench_rand();
+    *slot++ = (hi << 16) | lo;
+  }
+}
+
+// Exchange the two ints that a and b point to.
+static void swap(int *a, int *b) {
+  const int old_a = *a, old_b = *b;
+  *a = old_b;
+  *b = old_a;
+}
+
+// Sort a[l..r) into ascending order (r is exclusive) by recursive
+// partitioning around the first element of the range.
+static void myqsort(int *a, int l, int r) {
+  if (l >= r) {
+    return;
+  }
+
+  const int key = a[l];
+  int last_small = l;  // a[l+1..last_small] holds the elements seen so far that are < key
+  for (int scan = l + 1; scan < r; scan ++) {
+    if (a[scan] < key) {
+      last_small ++;
+      swap(&a[last_small], &a[scan]);
+    }
+  }
+  swap(&a[last_small], &a[l]);  // put the pivot between the two parts
+
+  myqsort(a, l, last_small);
+  myqsort(a, last_small + 1, r);
+}
+
+// Sort the whole prepared array, data[0..N), in place.
+void bench_qsort_run() {
+  myqsort(data, 0, N);
+}
+
+// Return 1 when checksum() over [data, data + N) equals setting->checksum,
+// 0 otherwise.
+int bench_qsort_validate() {
+  int *first = data;
+  int *last = data + N;
+  return setting->checksum == checksum(first, last);
+}
--- a/microbench/src/queen/queen.c
+++ b/microbench/src/queen/queen.c
@ -0,0 +1,32 @@
+#include <benchmark.h>
+
+// Bit mask with one bit per board column; assigned in bench_queen_prepare().
+static unsigned int FULL;
+
+// Count the ways to complete a partial placement.
+//   row - columns that already hold a queen
+//   ld  - columns blocked in the current row, shifted left once per row
+//   rd  - columns blocked in the current row, shifted right once per row
+// Returns 1 when every column in FULL is occupied.
+static unsigned int dfs(unsigned int row, unsigned int ld, unsigned int rd) {
+  if (row == FULL) {
+    return 1;
+  }
+
+  unsigned int total = 0;
+  unsigned int free_cols = FULL & ~(row | ld | rd);
+  while (free_cols != 0) {
+    unsigned int bit = free_cols & (~free_cols + 1);  // lowest set bit
+    free_cols ^= bit;
+    total += dfs(row | bit, (ld | bit) << 1, (rd | bit) >> 1);
+  }
+  return total;
+}
+
+// Result of the last bench_queen_run().
+static unsigned int ans;
+
+// Reset the result and build the column mask: the low setting->size bits set.
+void bench_queen_prepare() {
+  ans = 0;
+  // Shift an unsigned 1 so the mask is computed in FULL's own type; a signed
+  // `1 << 31` would overflow int.
+  FULL = (1u << setting->size) - 1;
+}
+
+// Run the search from an empty board and keep the count in ans.
+void bench_queen_run() {
+  ans = dfs(0, 0, 0);
+}
+
+// Return 1 when the computed count equals setting->checksum, 0 otherwise.
+int bench_queen_validate() {
+  if (ans != setting->checksum) {
+    return 0;
+  }
+  return 1;
+}
--- a/microbench/src/sieve/sieve.c
+++ b/microbench/src/sieve/sieve.c
@ -0,0 +1,42 @@
+#include <benchmark.h>
+
+static int N;
+
+static int ans;
+static uint32_t *primes;
+
+// Return bit n of the primes[] bitmap (0 or 1); 32 bits per word.
+static inline int get(int n) {
+  uint32_t word = primes[n >> 5];
+  int bit = n & 31;
+  return (word >> bit) & 1;
+}
+
+// Clear bit n of the primes[] bitmap.
+static inline void clear(int n) {
+  unsigned long mask = 1ul << (n & 31);
+  primes[n >> 5] &= ~mask;
+}
+
+// Allocate the bitmap (N / 8 + 128 bytes, N = setting->size) and set every
+// bit in words 0..N/32.
+void bench_sieve_prepare() {
+  N = setting->size;
+  primes = (uint32_t*)bench_alloc(N / 8 + 128);
+
+  const int last_word = N / 32;
+  int w = 0;
+  while (w <= last_word) {
+    primes[w] = ~(uint32_t)0;
+    w ++;
+  }
+}
+
+// Sieve the bitmap and store in ans how many bits in 2..N remain set.
+// If any bit in 1..N is already clear on entry, return at once and leave
+// ans untouched.
+void bench_sieve_run() {
+  for (int probe = 1; probe <= N; probe ++) {
+    if (get(probe) == 0) {
+      return;
+    }
+  }
+
+  for (int p = 2; p * p <= N; p ++) {
+    if (!get(p)) {
+      continue;
+    }
+    // strike out every multiple of p above p itself
+    for (int multiple = 2 * p; multiple <= N; multiple += p) {
+      clear(multiple);
+    }
+  }
+
+  int count = 0;
+  for (int k = 2; k <= N; k ++) {
+    count += get(k);
+  }
+  ans = count;
+}
+
+// Return 1 when the computed count equals setting->checksum, 0 otherwise.
+int bench_sieve_validate() {
+  if (ans != setting->checksum) {
+    return 0;
+  }
+  return 1;
+}
--- a/microbench/src/ssort/ssort.cc
+++ b/microbench/src/ssort/ssort.cc
@ -0,0 +1,111 @@
+// This is the Skew algorithm's reference implementation.
+
+#include <benchmark.h>
+
+static int N;
+
+// Lexicographic "<=" on pairs: (a1,a2) <= (b1,b2).
+inline bool leq(int a1, int a2,   int b1, int b2) {
+  if (a1 != b1) return a1 < b1;
+  return a2 <= b2;
+}
+// Lexicographic "<=" on triples: (a1,a2,a3) <= (b1,b2,b3).
+inline bool leq(int a1, int a2, int a3,   int b1, int b2, int b3) {
+  if (a1 != b1) return a1 < b1;
+  return leq(a2, a3, b2, b3);
+}
+// Stable counting sort: copy a[0..n-1] into b[0..n-1] ordered by the key
+// r[a[i]], where every key lies in 0..K. The counter array comes from
+// bench_alloc() and is not released here.
+static void radixPass(int* a, int* b, int* r, int n, int K)
+{
+  int* next = (int*)bench_alloc(sizeof(int)*(K+1));
+  for (int k = 0;  k <= K;  k++) next[k] = 0;        // reset counters
+  for (int i = 0;  i < n;  i++) next[r[a[i]]]++;     // count occurrences
+
+  // exclusive prefix sums: next[k] becomes the first output slot for key k
+  int start = 0;
+  for (int k = 0;  k <= K;  k++) {
+    int cnt = next[k];
+    next[k] = start;
+    start += cnt;
+  }
+
+  // distribute the elements in input order, which keeps the sort stable
+  for (int i = 0;  i < n;  i++) {
+    int elem = a[i];
+    b[next[r[elem]]++] = elem;
+  }
+}
+
+// find the suffix array SA of s[0..n-1] in {1..K}^n
+// require s[n]=s[n+1]=s[n+2]=0, n>=2
+//
+// Parameters:
+//   s  - input symbols; the three entries after s[n-1] must be 0 (see above)
+//   SA - output; SA[0..n-1] receives the suffix start positions in sorted order
+//   n  - number of symbols in s
+//   K  - largest symbol value; radixPass() uses it as the key range 0..K
+//
+// The four scratch arrays come from bench_alloc() and are not released in
+// this function. The function calls itself on the reduced string s12 when
+// the triple names are not yet unique.
+void suffixArray(int* s, int* SA, int n, int K) {
+  // n0, n1, n2: how many positions i in [0,n) have i%3 == 0, 1, 2.
+  // n02 = n0+n2 counts the mod 1 and mod 2 suffixes including the dummy
+  // mod 1 suffix added below when n%3 == 1.
+  int n0=(n+2)/3, n1=(n+1)/3, n2=n/3, n02=n0+n2;
+  // s12 and SA12 get three trailing zeros so they satisfy this function's own
+  // padding requirement when passed to the recursive call.
+  int* s12  = (int*)bench_alloc(sizeof(int)*(n02+3));  s12[n02]= s12[n02+1]= s12[n02+2]=0;
+  int* SA12 = (int*)bench_alloc(sizeof(int)*(n02+3)); SA12[n02]=SA12[n02+1]=SA12[n02+2]=0;
+  int* s0   = (int*)bench_alloc(sizeof(int)*n0);
+  int* SA0  = (int*)bench_alloc(sizeof(int)*n0);
+
+  // generate positions of mod 1 and mod  2 suffixes
+  // the "+(n0-n1)" adds a dummy mod 1 suffix if n%3 == 1
+  for (int i=0, j=0;  i < n+(n0-n1);  i++) if (i%3 != 0) s12[j++] = i;
+
+  // lsb radix sort the mod 1 and mod 2 triples
+  // (three passes: third, second, then first symbol of each triple)
+  radixPass(s12 , SA12, s+2, n02, K);
+  radixPass(SA12, s12 , s+1, n02, K);
+  radixPass(s12 , SA12, s  , n02, K);
+
+  // find lexicographic names of triples
+  // (name is incremented each time the sorted triple differs from the previous one)
+  int name = 0, c0 = -1, c1 = -1, c2 = -1;
+  for (int i = 0;  i < n02;  i++) {
+    if (s[SA12[i]] != c0 || s[SA12[i]+1] != c1 || s[SA12[i]+2] != c2) {
+      name++;  c0 = s[SA12[i]];  c1 = s[SA12[i]+1];  c2 = s[SA12[i]+2];
+    }
+    if (SA12[i] % 3 == 1) { s12[SA12[i]/3]      = name; } // left half
+    else                  { s12[SA12[i]/3 + n0] = name; } // right half
+  }
+
+  // recurse if names are not yet unique
+  if (name < n02) {
+    suffixArray(s12, SA12, n02, name);
+    // store unique names in s12 using the suffix array
+    for (int i = 0;  i < n02;  i++) s12[SA12[i]] = i + 1;
+  } else // generate the suffix array of s12 directly
+    for (int i = 0;  i < n02;  i++) SA12[s12[i] - 1] = i;
+
+  // stably sort the mod 0 suffixes from SA12 by their first character
+  for (int i=0, j=0;  i < n02;  i++) if (SA12[i] < n0) s0[j++] = 3*SA12[i];
+  radixPass(s0, SA0, s, n0, K);
+
+  // merge sorted SA0 suffixes and sorted SA12 suffixes
+  // p indexes SA0, t indexes SA12 (starting past the dummy suffix if there is
+  // one), k indexes the output SA.
+  for (int p=0,  t=n0-n1,  k=0;  k < n;  k++) {
+// GetI(): position in s of the suffix referred to by SA12[t]. The macro is
+// not #undef'd, so it stays defined after this function.
+#define GetI() (SA12[t] < n0 ? SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2)
+    int i = GetI(); // pos of current offset 12 suffix
+    int j = SA0[p]; // pos of current offset 0  suffix
+    // mod 1 suffixes are compared as (symbol, rank) pairs, mod 2 suffixes as
+    // (symbol, symbol, rank) triples; the ranks are read from s12
+    if (SA12[t] < n0 ?
+        leq(s[i],       s12[SA12[t] + n0], s[j],       s12[j/3]) :
+        leq(s[i],s[i+1],s12[SA12[t]-n0+1], s[j],s[j+1],s12[j/3+n0]))
+    { // suffix from SA12 is smaller
+      SA[k] = i;  t++;
+      if (t == n02) { // done --- only SA0 suffixes left
+        for (k++;  p < n0;  p++, k++) SA[k] = SA0[p];
+      }
+    } else {
+      SA[k] = j;  p++;
+      if (p == n0)  { // done --- only SA12 suffixes left
+        for (k++;  t < n02;  t++, k++) SA[k] = GetI();
+      }
+    }
+  }
+}
+
+extern "C" {
+
+// Input text (s) and the suffix array computed from it (sa).
+static int *s, *sa;
+
+// Allocate the text and suffix-array buffers (N + 10 ints each,
+// N = setting->size) and fill s[0..N-1] with bench_rand() % 26 after seeding
+// with 1.
+// NOTE(review): suffixArray()'s header asks for symbols in 1..K, while % 26
+// can produce 0. Left unchanged because the validated checksum is computed
+// from the suffix array of exactly this data.
+void bench_ssort_prepare() {
+  N = setting->size;
+  bench_srand(1);
+  s = (int*)bench_alloc(sizeof(int)*(N+10));
+  sa = (int*)bench_alloc(sizeof(int)*(N+10));
+
+  for (int i = 0; i < N; i ++) {
+    s[i] = bench_rand() % 26;
+  }
+
+  // suffixArray() requires s[n] = s[n+1] = s[n+2] = 0; write the padding
+  // explicitly instead of relying on what bench_alloc() returned.
+  s[N] = s[N+1] = s[N+2] = 0;
+}
+
+// Compute the suffix array of s[0..N-1] into sa, passing 26 as the largest
+// symbol value K.
+void bench_ssort_run() {
+  suffixArray(s, sa, N, 26);
+}
+
+// Return 1 when checksum() over [sa, sa + N) equals setting->checksum,
+// 0 otherwise.
+int bench_ssort_validate() {
+  int *first = sa;
+  int *last = sa + N;
+  return setting->checksum == checksum(first, last);
+}
+
+}
+
--- a/thread-os/Makefile
+++ b/thread-os/Makefile
@ -0,0 +1,3 @@
+NAME := thread-os
+SRCS := thread-os.c
+include $(AM_HOME)/Makefile
--- a/thread-os/thread-os.c
+++ b/thread-os/thread-os.c
@ -0,0 +1,71 @@
+#include <am.h>
+#include <klib.h>
+#include <klib-macros.h>
+
+// Number of slots in the per-CPU currents[] table below.
+#define MAX_CPU 8
+
+// A task is one 8192-byte object. The bookkeeping fields (anonymous struct)
+// and stack[] overlap, so the fields occupy the first bytes of stack[].
+// main() passes the bytes from just after `context` to the end of the union
+// to kcontext() as the task's stack area.
+typedef union task {
+  struct {
+    // label printed by func()
+    const char *name;
+    // successor in the circular list that main() builds
+    union task *next;
+    // task function handed to kcontext()
+    void      (*entry)(void *);
+    // set from kcontext() in main(), later overwritten by on_interrupt()
+    Context    *context;
+  };
+  uint8_t stack[8192];
+} Task;
+
+// Task currently selected on each CPU, indexed by cpu_current(). Entries are
+// NULL until that CPU's first on_interrupt() call.
+Task *currents[MAX_CPU];
+#define current currents[cpu_current()]
+
+// user-defined tasks
+
+// Flag behind lock()/unlock(), which func() wraps around its printf() call.
+int locked = 0;
+
+// Keep storing 1 into the flag until atomic_xchg() yields 0.
+void lock() {
+  for (;;) {
+    if (atomic_xchg(&locked, 1) == 0) {
+      break;
+    }
+  }
+}
+
+// Store 0 into the flag.
+void unlock() {
+  atomic_xchg(&locked, 0);
+}
+
+// Task body: forever print the task name and the current CPU number under
+// the lock, then spin through a short delay loop.
+//   arg - the name string that main() passes to kcontext() for this task
+void func(void *arg) {
+  // %s needs a char pointer; convert explicitly rather than passing the
+  // void * straight through the variadic call.
+  const char *name = arg;
+  while (1) {
+    lock();
+    printf("Thread-%s on CPU #%d\n", name, cpu_current());
+    unlock();
+    // volatile keeps the empty delay loop from being optimized away
+    for (int volatile i = 0; i < 100000; i++) ;
+  }
+}
+
+// The five tasks; all use func() as entry and differ only in name. next and
+// context are not initialized here; main() fills them in.
+Task tasks[] = {
+  { .name = "A", .entry = func },
+  { .name = "B", .entry = func },
+  { .name = "C", .entry = func },
+  { .name = "D", .entry = func },
+  { .name = "E", .entry = func },
+};
+
+// ------------------
+
+// Handler registered with cte_init(): remember the interrupted context in
+// the current task (or start from tasks[0] if this CPU has none yet), then
+// walk the circular list to the next task whose index modulo cpu_count()
+// equals this CPU's number, and return that task's context.
+// NOTE(review): the search only ends if some task index satisfies that
+// condition for the calling CPU.
+Context *on_interrupt(Event ev, Context *ctx) {
+  extern Task tasks[];
+  if (current) {
+    current->context = ctx;
+  } else {
+    current = &tasks[0];
+  }
+
+  for (;;) {
+    current = current->next;
+    if ((current - tasks) % cpu_count() == cpu_current()) {
+      break;
+    }
+  }
+  return current->context;
+}
+
+// Entry point passed to mpe_init() in main(): calls iset(true), then yield().
+// NOTE(review): presumably iset(true) enables interrupts and yield() traps
+// into on_interrupt(), which selects this CPU's first task — confirm
+// against the AM documentation.
+void mp_entry() {
+  iset(true);
+  yield();
+}
+
+// Initialize I/O and the interrupt handler, create a context for every entry
+// in tasks[], link the entries into a circular list, then start the
+// processors at mp_entry().
+int main() {
+  ioe_init();
+  cte_init(on_interrupt);
+
+  const int ntasks = LENGTH(tasks);
+  for (int idx = 0; idx < ntasks; idx++) {
+    Task *t = tasks + idx;
+    // the stack area begins right after the header's last field and runs to
+    // the end of the union
+    Area stk = (Area) { &t->context + 1, t + 1 };
+    t->context = kcontext(stk, t->entry, (void *)t->name);
+    t->next    = &tasks[(idx + 1) % ntasks];
+  }
+
+  mpe_init(mp_entry);
+}