port some benchmarks

2020-08-11 17:17:16 +00:00 · 2020-08-11 17:17:16 +00:00 · 009f8b5fb0
commit 009f8b5fb0
parent 30a534e650
30 changed files with 5913 additions and 0 deletions
--- a/coremark/Makefile
+++ b/coremark/Makefile
@ -0,0 +1,3 @@
 NAME = coremark
 SRCS = $(shell find -L ./src/ -name "*.c")
 include $(AM_HOME)/Makefile
--- a/coremark/include/core_portme.h
+++ b/coremark/include/core_portme.h
@ -0,0 +1,188 @@
 /* Topic : Description
 	This file contains configuration constants required to execute on different platforms
 */
 #ifndef CORE_PORTME_H
 #define CORE_PORTME_H
 #include <am.h>
 #include <klib.h>
 #include <klib-macros.h>
 #define ITERATIONS 1000
 #define MEM_METHOD MEM_STATIC
 /************************/
 /* Data types and settings */
 /************************/
 /* Configuration : HAS_FLOAT
 	Define to 1 if the platform supports floating point.
 */
 #ifndef HAS_FLOAT
 #define HAS_FLOAT 0
 #endif
 /* Configuration : HAS_TIME_H
 	Define to 1 if platform has the time.h header file,
 	and implementation of functions thereof.
 */
 #ifndef HAS_TIME_H
 #define HAS_TIME_H 0
 #endif
 /* Configuration : USE_CLOCK
 	Define to 1 if platform has the time.h header file,
 	and implementation of functions thereof.
 */
 #ifndef USE_CLOCK
 #define USE_CLOCK 0
 #endif
 /* Configuration : HAS_STDIO
 	Define to 1 if the platform has stdio.h.
 */
 #ifndef HAS_STDIO
 #define HAS_STDIO 0
 #endif
 /* Configuration : HAS_PRINTF
 	Define to 1 if the platform has stdio.h and implements the printf function.
 */
 #ifndef HAS_PRINTF
 #define HAS_PRINTF 1
 #endif
 /* Configuration : CORE_TICKS
 	Define type of return from the timing functions.
 */
 typedef uint32_t CORE_TICKS;
 /* Definitions : COMPILER_VERSION, COMPILER_FLAGS, MEM_LOCATION
 	Initialize these strings per platform
 */
 #ifndef COMPILER_VERSION
 #ifdef __GNUC__
 #define COMPILER_VERSION "GCC"__VERSION__
 #else
 #define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)"
 #endif
 #endif
 #ifndef COMPILER_FLAGS
 #define COMPILER_FLAGS
 #endif
 /* Default description of where the benchmark data lives.
 	This port selects MEM_METHOD MEM_STATIC at the top of this header, so the
 	default must say "STATIC" rather than "STACK". It can still be overridden
 	by defining MEM_LOCATION before this point. */
 #ifndef MEM_LOCATION
 #define MEM_LOCATION "STATIC"
 #endif
 /* Data Types :
 	To avoid compiler issues, define the data types that need to be used for 8b, 16b and 32b in <core_portme.h>.
 	*Important* :
 	ee_ptr_int needs to be the data type used to hold pointers, otherwise coremark may fail!!!
 */
 /* Fixed-purpose scalar types used throughout the benchmark sources. */
 typedef signed short ee_s16;	/* signed 16b value */
 typedef unsigned short ee_u16;	/* unsigned 16b value (CRCs are carried in this type) */
 typedef signed int ee_s32;	/* signed 32b value */
 typedef double ee_f32;	/* NOTE: despite the name this is a double, not a 32b float */
 typedef unsigned char ee_u8;	/* unsigned 8b value */
 typedef unsigned int ee_u32;	/* unsigned 32b value */
 typedef unsigned long ee_ptr_int;	/* integer type intended to hold a pointer (see note above) */
 typedef size_t ee_size_t;	/* size type; size_t is expected to come from the headers included above */
 /* align_mem :
 	This macro is used to align an offset to point to a 32b value. It is used in the Matrix algorithm to initialize the input memory blocks.
 	The argument is converted to unsigned long and rounded up to the next multiple of 4;
 	a value that is already a multiple of 4 is returned unchanged (e.g. 5 -> 8, 8 -> 8).
 	The result is a void pointer.
 */
 #define align_mem(x) (void *)(4 + (((unsigned long)(x) - 1) & ~3))
 /* Configuration : SEED_METHOD
 	Defines method to get seed values that cannot be computed at compile time.
 	Valid values :
 	SEED_ARG - from command line.
 	SEED_FUNC - from a system function.
 	SEED_VOLATILE - from volatile variables.
 */
 #ifndef SEED_METHOD
 #define SEED_METHOD SEED_VOLATILE
 #endif
 /* Configuration : MEM_METHOD
 	Defines method to get a block of memory.
 	Valid values :
 	MEM_MALLOC - for platforms that implement malloc and have malloc.h.
 	MEM_STATIC - to use a static memory array.
 	MEM_STACK - to allocate the data block on the stack (NYI).
 */
 #ifndef MEM_METHOD
 #define MEM_METHOD MEM_STACK
 #endif
 /* Configuration : MULTITHREAD
 	Define for parallel execution
 	Valid values :
 	1 - only one context (default).
 	N>1 - will execute N copies in parallel.
 	Note :
 	If this flag is defined to more than 1, an implementation for launching parallel contexts must be defined.
 	Two sample implementations are provided. Use <USE_PTHREAD> or <USE_FORK> to enable them.
 	It is valid to have a different implementation of <core_start_parallel> and <core_end_parallel> in <core_portme.c>,
 	to fit a particular architecture.
 */
 #ifndef MULTITHREAD
 #define MULTITHREAD 1
 #define USE_PTHREAD 0
 #define USE_FORK 0
 #define USE_SOCKET 0
 #endif
 /* Configuration : MAIN_HAS_NOARGC
 	Needed if platform does not support getting arguments to main.
 	Valid values :
 	0 - argc/argv to main is supported
 	1 - argc/argv to main is not supported
 	Note :
 	This flag only matters if MULTITHREAD has been defined to a value greater than 1.
 */
 #ifndef MAIN_HAS_NOARGC
 #define MAIN_HAS_NOARGC 0
 #endif
 /* Configuration : MAIN_HAS_NORETURN
 	Needed if platform does not support returning a value from main.
 	Valid values :
 	0 - main returns an int, and return value will be 0.
 	1 - platform does not support returning a value from main
 */
 #ifndef MAIN_HAS_NORETURN
 #define MAIN_HAS_NORETURN 0
 #endif
 /* Variable : default_num_contexts
 	Not used for this simple port, must contain the value 1.
 */
 extern ee_u32 default_num_contexts;
 /* Per-context, port-specific state.
 	An instance is embedded in every core_results (field `port`) and is handed
 	to portable_init / portable_fini. */
 typedef struct CORE_PORTABLE_S {
 	ee_u8	portable_id;	/* single byte of port state; its use is defined by the port layer, not shown here */
 } core_portable;
 /* target specific init/fini */
 void portable_init(core_portable *p, int *argc, char *argv[]);
 void portable_fini(core_portable *p);
 /* Pick a run type when the build did not request one explicitly.
 	The choice is derived from TOTAL_DATA_SIZE:
 	1200 -> PROFILE_RUN, 2000 -> PERFORMANCE_RUN, anything else -> VALIDATION_RUN.
 	Exactly one of the three macros ends up defined to 1. */
 #if !defined(PROFILE_RUN) && !defined(PERFORMANCE_RUN) && !defined(VALIDATION_RUN)
 #if (TOTAL_DATA_SIZE==1200)
 #define PROFILE_RUN 1
 #elif (TOTAL_DATA_SIZE==2000)
 #define PERFORMANCE_RUN 1
 #else
 #define VALIDATION_RUN 1
 #endif
 #endif
 #endif /* CORE_PORTME_H */
--- a/coremark/include/coremark.h
+++ b/coremark/include/coremark.h
@ -0,0 +1,174 @@
 /*
 Author : Shay Gal-On, EEMBC
 This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
 All rights reserved.
 EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
 CoreMark License that is distributed with the official EEMBC COREMARK Software release.
 If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
 you must discontinue use and download the official release from www.coremark.org.
 Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
 make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
 EEMBC
 4354 Town Center Blvd. Suite 114-200
 El Dorado Hills, CA, 95762
 */
 /* Topic: Description
 	This file contains  declarations of the various benchmark functions.
 */
 /* Configuration: TOTAL_DATA_SIZE
 	Define total size for data algorithms will operate on
 */
 /* Default total data size in bytes (2000).
 	The replacement list is parenthesized so the macro behaves as a single
 	value inside larger expressions such as `x / TOTAL_DATA_SIZE`. */
 #ifndef TOTAL_DATA_SIZE
 #define TOTAL_DATA_SIZE (2*1000)
 #endif
 /* Values accepted by SEED_METHOD (selected in core_portme.h). */
 #define SEED_ARG 0	/* seeds come from the command line */
 #define SEED_FUNC 1	/* seeds come from a system function */
 #define SEED_VOLATILE 2	/* seeds come from volatile variables */
 /* Values accepted by MEM_METHOD (selected in core_portme.h).
 	These must be defined before core_portme.h is included, because that header uses them. */
 #define MEM_STATIC 0	/* use a static memory array */
 #define MEM_MALLOC 1	/* obtain the block through portable_malloc */
 #define MEM_STACK 2	/* allocate the data block on the stack */
 #include "core_portme.h"
 #if HAS_STDIO
 #include <stdio.h>
 #endif
 #if HAS_PRINTF
 #define ee_printf printf
 #endif
 /* Actual benchmark execution in iterate */
 void *iterate(void *pres);
 /* Typedef: secs_ret
 	For machines that have floating point support, get number of seconds as a double.
 	Otherwise an unsigned int.
 */
 #if HAS_FLOAT
 typedef double secs_ret;
 #else
 typedef ee_u32 secs_ret;
 #endif
 #if MAIN_HAS_NORETURN
 #define MAIN_RETURN_VAL
 #define MAIN_RETURN_TYPE void
 #else
 #define MAIN_RETURN_VAL 0
 #define MAIN_RETURN_TYPE int
 #endif
 void start_time(void);
 void stop_time(void);
 CORE_TICKS get_time(void);
 secs_ret time_in_secs(CORE_TICKS ticks);
 /* Misc useful functions */
 ee_u16 crcu8(ee_u8 data, ee_u16 crc);
 ee_u16 crc16(ee_s16 newval, ee_u16 crc);
 ee_u16 crcu16(ee_u16 newval, ee_u16 crc);
 ee_u16 crcu32(ee_u32 newval, ee_u16 crc);
 ee_u8 check_data_types();
 void *portable_malloc(ee_size_t size);
 void portable_free(void *p);
 ee_s32 parseval(char *valstring);
 /* Algorithm IDS */
 #define ID_LIST 	(1<<0)
 #define ID_MATRIX 	(1<<1)
 #define ID_STATE 	(1<<2)
 #define ALL_ALGORITHMS_MASK (ID_LIST|ID_MATRIX|ID_STATE)
 #define NUM_ALGORITHMS 3
 /* list data structures */
 /* Payload attached to each list cell. */
 typedef struct list_data_s {
 	ee_s16 data16;	/* packed value: upper 8b back up the original data, bit 7 marks a cached result, bits 0-6 hold the operation/input or the cached 7b value */
 	ee_s16 idx;	/* index that records the initial order of the list */
 } list_data;
 /* One cell of the singly linked list; the payload is stored separately and referenced by pointer. */
 typedef struct list_head_s {
 	struct list_head_s *next;	/* following cell, or NULL at the end of the list */
 	struct list_data_s *info;	/* payload for this cell (the list code swaps these pointers between cells) */
 } list_head;
 /*matrix benchmark related stuff */
 #define MATDAT_INT 1
 #if MATDAT_INT
 typedef ee_s16 MATDAT;
 typedef ee_s32 MATRES;
 #else
 typedef ee_f16 MATDAT;
 typedef ee_f32 MATRES;
 #endif
 /* Parameter bundle passed to the matrix benchmark.
 	NOTE(review): the matrix code is not visible here; the field roles below are inferred from the types - confirm against core_matrix.c. */
 typedef struct MAT_PARAMS_S {
 	int N;	/* presumably the matrix dimension */
 	MATDAT *A;	/* data of element type MATDAT (presumably an input matrix) */
 	MATDAT *B;	/* data of element type MATDAT (presumably an input matrix) */
 	MATRES *C;	/* data of the wider element type MATRES (presumably the result matrix) */
 } mat_params;
 /* state machine related stuff */
 /* List of all the possible states for the FSM */
 /* Enumerators are consecutive starting at 0, so NUM_CORE_STATES equals the number of real states. */
 typedef enum CORE_STATE {
 	CORE_START=0,
 	CORE_INVALID,
 	CORE_S1,
 	CORE_S2,
 	CORE_INT,
 	CORE_FLOAT,
 	CORE_EXPONENT,
 	CORE_SCIENTIFIC,
 	NUM_CORE_STATES	/* count of states above; not a state itself */
 } core_state_e ;
 /* Helper structure to hold results */
 /* Per-context benchmark state: seeds and memory going in, CRCs and error count coming out. */
 typedef struct RESULTS_S {
 	/* inputs */
 	ee_s16	seed1;		/* Initializing seed */
 	ee_s16	seed2;		/* Initializing seed */
 	ee_s16	seed3;		/* Initializing seed (core_bench_list uses it as the number of find operations) */
 	void	*memblock[4];	/* Pointer to safe memory location: [0] is the whole block, [1]..[3] are the per-algorithm slices assigned in main */
 	ee_u32	size;		/* Size of the data (main divides it by the number of enabled algorithms) */
 	ee_u32 iterations;		/* Number of iterations to execute */
 	ee_u32	execs;		/* Bitmask of operations to execute (ID_LIST, ID_MATRIX, ID_STATE) */
 	struct list_head_s *list;	/* list built by core_list_init */
 	mat_params mat;	/* parameters filled in by core_init_matrix */
 	/* outputs */
 	ee_u16	crc;	/* running CRC over all work done */
 	ee_u16	crclist;	/* CRC captured after the first iteration of the list benchmark */
 	ee_u16	crcmatrix;	/* first non-zero result of the matrix benchmark */
 	ee_u16	crcstate;	/* first non-zero result of the state benchmark */
 	ee_s16	err;	/* error counter, cleared by main before the run */
 	/* multithread specific */
 	core_portable port;
 } core_results;
 /* Multicore execution handling */
 #if (MULTITHREAD>1)
 ee_u8 core_start_parallel(core_results *res);
 ee_u8 core_stop_parallel(core_results *res);
 #endif
 /* list benchmark functions */
 list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed);
 ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx);
 /* state benchmark functions */
 void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p);
 ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock,
 		ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc);
 /* matrix benchmark functions */
 ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p);
 ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc);
--- a/coremark/src/core_list_join.c
+++ b/coremark/src/core_list_join.c
@ -0,0 +1,496 @@
 /*
 Author : Shay Gal-On, EEMBC
 This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
 All rights reserved.
 EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
 CoreMark License that is distributed with the official EEMBC COREMARK Software release.
 If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
 you must discontinue use and download the official release from www.coremark.org.
 Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
 make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
 EEMBC
 4354 Town Center Blvd. Suite 114-200
 El Dorado Hills, CA, 95762
 */
 #include "coremark.h"
 /*
 Topic: Description
 	Benchmark using a linked list.
 	Linked list is a common data structure used in many applications.
 	For our purposes, this will exercise the memory units of the processor.
 	In particular, usage of the list pointers to find and alter data.
 	We are not using Malloc since some platforms do not support this library.
 	Instead, the memory block being passed in is used to create a list,
 	and the benchmark takes care not to add more items than can be
 	accommodated by the memory block. The porting layer will make sure
 	that we have a valid memory block.
 	All operations are done in place, without using any extra memory.
 	The list itself contains list pointers and pointers to data items.
 	Data items contain the following:
 	idx - An index that captures the initial order of the list.
 	data - Variable data initialized based on the input parameters. The 16b are divided as follows:
 	o Upper 8b are backup of original data.
 	o Bit 7 indicates if the lower 7 bits are to be used as is or calculated.
 	o Bits 0-2 indicate type of operation to perform to get a 7b value.
 	o Bits 3-6 provide input for the operation.
 */
 /* local functions */
 /* Forward declarations for the list helpers defined later in this file. */
 list_head *core_list_find(list_head *list,list_data *info);
 list_head *core_list_reverse(list_head *list);
 list_head *core_list_remove(list_head *item);
 list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modified);
 list_head *core_list_insert_new(list_head *insert_point
 	, list_data *info, list_head **memblock, list_data **datablock
 	, list_head *memblock_end, list_data *datablock_end);
 /* Comparison callback used by the sort: result <= 0 keeps `a` ahead of `b`.
 	`res` is passed through from core_list_mergesort and may be NULL. */
 typedef ee_s32(*list_cmp)(list_data *a, list_data *b, core_results *res);
 list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res);
 /* Function: calc_func
 	Turn the packed 16b item at *pdata into a 7b value, caching the result in place.
 	Operation:
 	If bit 7 is already set, the low 7 bits are returned as they are.
 	Otherwise bits 0-2 select the work to do (0 - state benchmark, 1 - matrix
 	benchmark, anything else - use the raw data) and bits 3-6, replicated into
 	both nibbles of a byte, are the argument for that work. The outcome is folded
 	into res->crc, reduced to 7 bits, and written back to *pdata with bit 7 set.
 	Parameters:
 	pdata - packed data item; updated when a new value is computed.
 	res - benchmark context (memory blocks, seeds, running CRCs).
 	Returns:
 	The 7b value for this item.
 */
 ee_s16 calc_func(ee_s16 *pdata, core_results *res) {
 	const ee_s16 packed=*pdata;
 	ee_s16 result;
 	ee_s16 op;
 	ee_s16 arg;
 	/* bit 7 set means a previous call already cached the answer */
 	if ((packed>>7) & 1)
 		return (packed & 0x007f);
 	op=packed & 0x7; /* bits 0-2: which function to run */
 	arg=((packed>>3) & 0xf); /* bits 3-6: data for that function */
 	arg |= arg << 4; /* copy the nibble upward to form an 8b value */
 	if (op==0) {
 		if (arg<0x22) /* enforce the minimum period for bit corruption */
 			arg=0x22;
 		result=core_bench_state(res->size,res->memblock[3],res->seed1,res->seed2,arg,res->crc);
 		if (res->crcstate==0)
 			res->crcstate=result;
 	} else if (op==1) {
 		result=core_bench_matrix(&(res->mat),arg,res->crc);
 		if (res->crcmatrix==0)
 			res->crcmatrix=result;
 	} else {
 		result=packed;
 	}
 	res->crc=crcu16(result,res->crc);
 	result &= 0x007f;
 	/* keep the backup byte, flag the item as cached, store the 7b value */
 	*pdata = (packed & 0xff00) | 0x0080 | result;
 	return result;
 }
 /* Function: cmp_complex
 	Order two list cells by their computed 7b values.
 	Both items go through <calc_func>, `a` first and then `b`, so the items may
 	be updated and res->crc advanced as a side effect.
 	Returns:
 	value(a) - value(b). Suitable as a comparator for mergesort.
 */
 ee_s32 cmp_complex(list_data *a, list_data *b, core_results *res) {
 	ee_s32 delta;
 	/* two separate statements: a must be evaluated before b */
 	delta=calc_func(&a->data16,res);
 	delta-=calc_func(&b->data16,res);
 	return delta;
 }
 /* Function: cmp_idx
 	Order two list cells by their idx field.
 	When res is NULL the low byte of each data16 is first rebuilt from the
 	backup kept in its high byte, which discards any cached result.
 	Returns:
 	a->idx - b->idx. Suitable as a comparator for mergesort.
 */
 ee_s32 cmp_idx(list_data *a, list_data *b, core_results *res) {
 	if (!res) {
 		ee_s32 restored;
 		restored=(a->data16 & 0xff00) | ((a->data16>>8) & 0x00ff);
 		a->data16=restored;
 		restored=(b->data16 & 0xff00) | ((b->data16>>8) & 0x00ff);
 		b->data16=restored;
 	}
 	return a->idx - b->idx;
 }
 /* Function: copy_info
 	Copy one list payload (data16 and idx) from `from` into `to`.
 */
 void copy_info(list_data *to,list_data *from) {
 	/* list_data consists of exactly these two fields, so a struct copy transfers both */
 	*to=*from;
 }
 /* Function: core_bench_list
 	Benchmark for linked list:
 	- Try to find multiple data items.
 	- List sort
 	- Operate on data from list (crc)
 	- Single remove/reinsert
 	* At the end of this function, the list is back to original state
 	Parameters:
 	res - benchmark context; res->list is the list to work on and res->seed3
 		is the number of find operations to perform.
 	finder_idx - starting index to search for. A negative value switches the
 		searches to match on data instead of idx, and also skips the sort by
 		data content (which only runs when finder_idx>0).
 	Returns:
 	16b value combining the find statistics with CRCs taken over the list.
 */
 ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx) {
 	ee_u16 retval=0;
 	ee_u16 found=0,missed=0;
 	list_head *list=res->list;
 	ee_s16 find_num=res->seed3;
 	list_head *this_find;
 	list_head *finder, *remover;
 	/* {0} rather than {}: an empty initializer list is not valid C before C23 */
 	list_data info = {0};
 	ee_s16 i;
 	info.idx=finder_idx;
 	/* find <find_num> values in the list, and change the list each time (reverse and cache if value found) */
 	for (i=0; i<find_num; i++) {
 		info.data16= (i & 0xff) ;
 		this_find=core_list_find(list,&info);
 		list=core_list_reverse(list);
 		if (this_find==NULL) {
 			missed++;
 			retval+=(list->next->info->data16 >> 8) & 1;
 		}
 		else {
 			found++;
 			if (this_find->info->data16 & 0x1) /* use found value */
 				retval+=(this_find->info->data16 >> 9) & 1;
 			/* and cache next item at the head of the list (if any) */
 			if (this_find->next != NULL) {
 				finder = this_find->next;
 				this_find->next = finder->next;
 				finder->next=list->next;
 				list->next=finder;
 			}
 		}
 		/* when searching by index, look for the following index next time */
 		if (info.idx>=0)
 			info.idx++;
 #if CORE_DEBUG
 	ee_printf("List find %d: [%d,%d,%d]\n",i,retval,missed,found);
 #endif
 	}
 	retval+=found*4-missed;
 	/* sort the list by data content and remove one item*/
 	if (finder_idx>0)
 		list=core_list_mergesort(list,cmp_complex,res);
 	remover=core_list_remove(list->next);
 	/* CRC data content of list from location of index N forward, and then undo remove */
 	finder=core_list_find(list,&info);
 	if (!finder)
 		finder=list->next;
 	/* NOTE: the value fed to the CRC is list->info->data16 (the head cell), once
 	   per remaining cell; `finder` only controls how many times it is applied.
 	   Reading finder->info instead would change the returned value, so this
 	   must stay as it is. */
 	while (finder) {
 		retval=crc16(list->info->data16,retval);
 		finder=finder->next;
 	}
 #if CORE_DEBUG
 	ee_printf("List sort 1: %04x\n",retval);
 #endif
 	remover=core_list_undo_remove(remover,list->next);
 	/* sort the list by index, in effect returning the list to original state */
 	list=core_list_mergesort(list,cmp_idx,NULL);
 	/* CRC data content of list (same head-cell behaviour as the loop above) */
 	finder=list->next;
 	while (finder) {
 		retval=crc16(list->info->data16,retval);
 		finder=finder->next;
 	}
 #if CORE_DEBUG
 	ee_printf("List sort 2: %04x\n",retval);
 #endif
 	return retval;
 }
 /* Function: core_list_init
 	Initialize list with data.
 	The block is split in two: list cells first, payloads directly after them.
 	A fake head (idx 0, data 0x8080) and a fake tail (idx 0x7fff, data 0xffff)
 	are created, data items are inserted between them, every item except the
 	tail is given an index, and the list is finally sorted by index.
 	Parameters:
 	blksize - Size of memory to be initialized.
 	memblock - Pointer to memory block.
 	seed - 	Actual values chosen depend on the seed parameter.
 		The seed parameter MUST be supplied from a source that cannot be determined at compile time
 	Returns:
 	Pointer to the head of the list.
 */
 list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed) {
 	/* calculated pointers for the list */
 	/* 16 is a fixed allowance per list cell, independent of the real pointer size */
 	ee_u32 per_item=16+sizeof(struct list_data_s);
 	ee_u32 size=(blksize/per_item)-2; /* to accommodate systems with 64b pointers, and make sure same code is executed, set max list elements */
 	list_head *memblock_end=memblock+size;
 	/* payload area starts where the cell area ends */
 	list_data *datablock=(list_data *)(memblock_end);
 	list_data *datablock_end=datablock+size;
 	/* some useful variables */
 	ee_u32 i;
 	list_head *finder,*list=memblock;
 	list_data info;
 	/* create a fake items for the list head and tail */
 	list->next=NULL;
 	list->info=datablock;
 	list->info->idx=0x0000;
 	list->info->data16=(ee_s16)0x8080;
 	memblock++;
 	datablock++;
 	info.idx=0x7fff;
 	info.data16=(ee_s16)0xffff;
 	/* inserted first, and every later insert goes right after the head, so this item stays last */
 	core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end);
 	/* then insert size items */
 	/* the return value is not checked: core_list_insert_new returns NULL without inserting once the block is full */
 	for (i=0; i<size; i++) {
 		ee_u16 datpat=((ee_u16)(seed^i) & 0xf);
 		ee_u16 dat=(datpat<<3) | (i&0x7); /* alternate between algorithms */
 		info.data16=(dat<<8) | dat;		/* fill the data with actual data and upper bits with rebuild value */
 		core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end);
 	}
 	/* and now index the list so we know initial seed order of the list */
 	/* walks from the first item after the head up to, but not including, the tail */
 	finder=list->next;
 	i=1;
 	while (finder->next!=NULL) {
 		if (i<size/5) /* first 20% of the list in order */
 			finder->info->idx=i++;
 		else {
 			/* i is incremented here, so the next line already sees the new value of i */
 			ee_u16 pat=(ee_u16)(i++ ^ seed); /* get a pseudo random number */
 			finder->info->idx=0x3fff & (((i & 0x07) << 8) | pat); /* make sure the mixed items end up after the ones in sequence */
 		}
 		finder=finder->next;
 	}
 	/* res is NULL, so cmp_idx also rebuilds the low data byte from the backup byte */
 	list = core_list_mergesort(list,cmp_idx,NULL);
 #if CORE_DEBUG
 	ee_printf("Initialized list:\n");
 	finder=list;
 	while (finder) {
 		ee_printf("[%04x,%04x]",finder->info->idx,(ee_u16)finder->info->data16);
 		finder=finder->next;
 	}
 	ee_printf("\n");
 #endif
 	return list;
 }
 /* Function: core_list_insert_new
 	Take the next free cell and payload slot and link them in after insert_point.
 	Parameters:
 	insert_point - cell after which the new item is placed.
 	info - payload to copy into the new item.
 	memblock - in/out cursor over the free list cells; advanced on success.
 	datablock - in/out cursor over the free payload slots; advanced on success.
 	memblock_end - end of region for list cells.
 	datablock_end - end of region for payloads.
 	Returns:
 	Pointer to the new item, or NULL (nothing modified) when either region
 	has no room left.
 */
 list_head *core_list_insert_new(list_head *insert_point, list_data *info, list_head **memblock, list_data **datablock
 	, list_head *memblock_end, list_data *datablock_end) {
 	list_head *cell;
 	/* refuse the insert if either cursor would reach the end of its region */
 	if (((*memblock+1) >= memblock_end) || ((*datablock+1) >= datablock_end))
 		return NULL;
 	/* claim a cell and splice it in right behind insert_point */
 	cell=(*memblock)++;
 	cell->next=insert_point->next;
 	insert_point->next=cell;
 	/* claim a payload slot and fill it */
 	cell->info=(*datablock)++;
 	copy_info(cell->info,info);
 	return cell;
 }
 /* Function: core_list_remove
 	Remove an item from a singly linked list without knowing its predecessor.
 	Operation:
 	The payload pointers of `item` and its successor are exchanged, and the
 	successor cell is then unlinked. The effect is that item's original
 	payload leaves the list, carried by the unlinked cell.
 	Note:
 	`item` must have a successor; the fake tail item guarantees this for
 	every real item.
 	Returns:
 	The unlinked cell (its next pointer is NULL).
 */
 list_head *core_list_remove(list_head *item) {
 	list_head *victim=item->next;
 	list_data *kept=item->info;
 	/* exchange payloads between item and the cell about to be unlinked */
 	item->info=victim->info;
 	victim->info=kept;
 	/* bypass the victim and detach it */
 	item->next=victim->next;
 	victim->next=NULL;
 	return victim;
 }
 /* Function: core_list_undo_remove
 	Reverse the effect of <core_list_remove>.
 	Operation:
 	Every benchmark iteration has to see the same list, so a removal must be
 	reversible. The payload pointers are exchanged back and the removed cell
 	is linked in again directly after the modified one.
 	Parameters:
 	item_removed - cell returned by <core_list_remove>.
 	item_modified - cell that was passed to <core_list_remove>.
 	Returns:
 	The cell that was linked back into the list (item_removed).
 */
 list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modified) {
 	list_data *swap=item_modified->info;
 	/* exchange payloads back */
 	item_modified->info=item_removed->info;
 	item_removed->info=swap;
 	/* relink the removed cell behind the modified one */
 	item_removed->next=item_modified->next;
 	item_modified->next=item_removed;
 	return item_removed;
 }
 /* Function: core_list_find
 	Linear search through the list.
 	Operation:
 	If info->idx is non-negative, look for the first cell with that idx.
 	Otherwise look for the first cell whose low data byte equals info->data16.
 	Parameters:
 	list - cell to start searching from
 	info - idx or data to find
 	Returns:
 	Found item, or NULL if not found.
 */
 list_head *core_list_find(list_head *list,list_data *info) {
 	list_head *cur;
 	if (info->idx>=0) {
 		/* search by index */
 		for (cur=list; cur!=NULL; cur=cur->next) {
 			if (cur->info->idx == info->idx)
 				return cur;
 		}
 		return NULL;
 	}
 	/* search by the low 8 bits of the data */
 	for (cur=list; cur!=NULL; cur=cur->next) {
 		if ((cur->info->data16 & 0xff) == info->data16)
 			return cur;
 	}
 	return NULL;
 }
 /* Function: core_list_reverse
 	Reverse a list in place.
 	Operation:
 	Each cell's next pointer is redirected to the cell that preceded it.
 	Parameters:
 	list - first cell of the list to reverse (may be NULL)
 	Returns:
 	The new first cell (the old last one), or NULL for an empty list.
 */
 list_head *core_list_reverse(list_head *list) {
 	list_head *reversed=NULL;
 	while (list!=NULL) {
 		list_head *rest=list->next; /* remember the unprocessed remainder */
 		list->next=reversed; /* push the current cell onto the reversed part */
 		reversed=list;
 		list=rest;
 	}
 	return reversed;
 }
 /* Function: core_list_mergesort
 	Sort the list in place without recursion.
 	Description:
 	Use mergesort, as for linked list this is a realistic solution.
 	Also, since this is aimed at embedded, care was taken to use iterative rather than recursive algorithm.
 	The sort can either return the list to original order (by idx) ,
 	or use the data item to invoke other algorithms and change the order of the list.
 	Each pass merges adjacent runs of `insize` cells; insize doubles until a
 	pass needs only a single merge. Equal elements keep their relative order,
 	because the element from the first run wins when cmp returns 0.
 	Parameters:
 	list - list to be sorted (NULL is accepted and returned unchanged).
 	cmp - cmp function to use
 	res - context handed through to cmp; may be NULL.
 	Returns:
 	New head of the list.
 	Note:
 	We have a special header for the list that will always be first,
 	but the algorithm could theoretically modify where the list starts.
 */
 list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res) {
 	list_head *p, *q, *e, *tail;
 	ee_s32 insize, nmerges, psize, qsize, i;
 	/* An empty list is already sorted. Without this check the pass below would
 	   finish with tail==NULL and then dereference it. */
 	if (list == NULL)
 		return NULL;
 	insize = 1;
 	while (1) {
 		p = list;
 		list = NULL;
 		tail = NULL;
 		nmerges = 0;  /* count number of merges we do in this pass */
 		while (p) {
 			nmerges++;  /* there exists a merge to be done */
 			/* step `insize' places along from p */
 			q = p;
 			psize = 0;
 			for (i = 0; i < insize; i++) {
 				psize++;
 				q = q->next;
 				if (!q) break;
 			}
 			/* if q hasn't fallen off end, we have two lists to merge */
 			qsize = insize;
 			/* now we have two lists; merge them */
 			while (psize > 0 || (qsize > 0 && q)) {
 				/* decide whether next element of merge comes from p or q */
 				if (psize == 0) {
 					/* p is empty; e must come from q. */
 					e = q; q = q->next; qsize--;
 				} else if (qsize == 0 || !q) {
 					/* q is empty; e must come from p. */
 					e = p; p = p->next; psize--;
 				} else if (cmp(p->info,q->info,res) <= 0) {
 					/* First element of p is lower (or same); e must come from p. */
 					e = p; p = p->next; psize--;
 				} else {
 					/* First element of q is lower; e must come from q. */
 					e = q; q = q->next; qsize--;
 				}
 				/* add the next element to the merged list */
 				if (tail) {
 					tail->next = e;
 				} else {
 					list = e;
 				}
 				tail = e;
 			}
 			/* now p has stepped `insize' places along, and q has too */
 			p = q;
 		}
 		/* the list is non-empty here, so at least one element was appended and tail is valid */
 		tail->next = NULL;
 		/* If we have done only one merge, we're finished. */
 		if (nmerges <= 1)
 			return list;
 		/* Otherwise repeat, merging lists twice the size */
 		insize *= 2;
 	}
 #if COMPILER_REQUIRES_SORT_RETURN
 	return list;
 #endif
 }
--- a/coremark/src/core_main.c
+++ b/coremark/src/core_main.c
@ -0,0 +1,339 @@
 /*
 Author : Shay Gal-On, EEMBC
 This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
 All rights reserved.
 EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
 CoreMark License that is distributed with the official EEMBC COREMARK Software release.
 If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
 you must discontinue use and download the official release from www.coremark.org.
 Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
 make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
 EEMBC
 4354 Town Center Blvd. Suite 114-200
 El Dorado Hills, CA, 95762
 */
 /* File: core_main.c
 	This file contains the framework to acquire a block of memory, seed initial parameters, run the benchmark and report the results.
 */
 #include "coremark.h"
 /* Function: iterate
 	Run the benchmark for a specified number of iterations.
 	Operation:
 	For each type of benchmarked algorithm:
 		a - Initialize the data block for the algorithm.
 		b - Execute the algorithm N times.
 	Returns:
 	NULL.
 */
 /* Expected CRC values for the list, matrix and state algorithms, five entries each.
 	NOTE(review): presumably indexed by the known_id that main derives from the
 	seed CRC; the code that reads these tables is not visible here - confirm. */
 static ee_u16 list_known_crc[]   =      {(ee_u16)0xd4b0,(ee_u16)0x3340,(ee_u16)0x6a79,(ee_u16)0xe714,(ee_u16)0xe3c1};
 static ee_u16 matrix_known_crc[] =      {(ee_u16)0xbe52,(ee_u16)0x1199,(ee_u16)0x5608,(ee_u16)0x1fd7,(ee_u16)0x0747};
 static ee_u16 state_known_crc[]  =      {(ee_u16)0x5e47,(ee_u16)0x39bf,(ee_u16)0xe5a4,(ee_u16)0x8e3a,(ee_u16)0x8d84};
 void *iterate(void *pres) {
 	core_results *ctx=(core_results *)pres;
 	const ee_u32 total=ctx->iterations;
 	ee_u32 pass;
 	/* every run starts from cleared CRC accumulators */
 	ctx->crc=0;
 	ctx->crclist=0;
 	ctx->crcmatrix=0;
 	ctx->crcstate=0;
 	for (pass=0; pass<total; pass++) {
 		/* The benchmark result goes through a temporary on purpose:
 		   core_bench_list can itself update ctx->crc, so that call must
 		   finish before ctx->crc is read for the crcu16 call. */
 		ee_u16 step_crc=core_bench_list(ctx,1); /* positive index: includes the sort by data */
 		ctx->crc=crcu16(step_crc,ctx->crc);
 		step_crc=core_bench_list(ctx,-1); /* negative index: search by data, no sort by data */
 		ctx->crc=crcu16(step_crc,ctx->crc);
 		/* snapshot the CRC after the very first pass */
 		if (pass==0)
 			ctx->crclist=ctx->crc;
 	}
 	return NULL;
 }
 /* Seed accessors.
 	get_seed(x) yields a 16b seed and get_seed_32(x) a 32b one.
 	With SEED_ARG both are macros that expand to get_seed_args(x,argc,argv), so
 	they can only be used where argc and argv are in scope. */
 #if (SEED_METHOD==SEED_ARG)
 ee_s32 get_seed_args(int i, int argc, char *argv[]);
 #define get_seed(x) (ee_s16)get_seed_args(x,argc,argv)
 #define get_seed_32(x) get_seed_args(x,argc,argv)
 #else /* via function or volatile */
 ee_s32 get_seed_32(int i);
 #define get_seed(x) (ee_s16)get_seed_32(x)
 #endif
 /* Backing store for the benchmark data when a static array is selected. */
 #if (MEM_METHOD==MEM_STATIC)
 ee_u8 static_memblk[TOTAL_DATA_SIZE];
 #endif
 /* Names in the order of the MEM_STATIC (0), MEM_MALLOC (1), MEM_STACK (2) constants. */
 char *mem_name[3] = {"Static","Heap","Stack"};
 /* Function: main
 	Main entry routine for the benchmark.
 	This function is responsible for the following steps:
 	1 - Initialize input seeds from a source that cannot be determined at compile time.
 	2 - Initialize memory block for use.
 	3 - Run and time the benchmark.
 	4 - Report results, testing the validity of the output if the seeds are known.
 	Arguments:
 	1 - first seed  : Any value
 	2 - second seed : Must be identical to first for iterations to be identical
 	3 - third seed  : Any value, should be at least an order of magnitude less then the input size, but bigger then 32.
 	4 - Iterations  : Special, if set to 0, iterations will be automatically determined such that the benchmark will run between 10 to 100 secs
 */
 #if MAIN_HAS_NOARGC
 MAIN_RETURN_TYPE main(void) {
 	int argc=0;
 	char *argv[1];
 #else
 MAIN_RETURN_TYPE main(int argc, char *argv[]) {
 #endif
 	ee_u16 i,j=0,num_algorithms=0;
 	ee_s16 known_id=-1,total_errors=0;
 	ee_u16 seedcrc=0;
 	CORE_TICKS total_time;
 	core_results results[MULTITHREAD];
 #if (MEM_METHOD==MEM_STACK)
 	ee_u8 stack_memblock[TOTAL_DATA_SIZE*MULTITHREAD];
 #endif
  ioe_init();
  ee_printf("Running CoreMark for %d iterations\n", ITERATIONS);
 	/* first call any initializations needed */
 	portable_init(&(results[0].port), &argc, argv);
 	/* First some checks to make sure benchmark will run ok */
 	if (sizeof(struct list_head_s)>128) {
 		ee_printf("list_head structure too big for comparable data!\n");
 		return MAIN_RETURN_VAL;
 	}
 	results[0].seed1=get_seed(1);
 	results[0].seed2=get_seed(2);
 	results[0].seed3=get_seed(3);
 	results[0].iterations=get_seed_32(4);
 #if CORE_DEBUG
 	results[0].iterations=1;
 #endif
 	results[0].execs=get_seed_32(5);
 	if (results[0].execs==0) { /* if not supplied, execute all algorithms */
 		results[0].execs=ALL_ALGORITHMS_MASK;
 	}
 		/* put in some default values based on one seed only for easy testing */
 	if ((results[0].seed1==0) && (results[0].seed2==0) && (results[0].seed3==0)) { /* validation run */
 		results[0].seed1=0;
 		results[0].seed2=0;
 		results[0].seed3=0x66;
 	}
 	if ((results[0].seed1==1) && (results[0].seed2==0) && (results[0].seed3==0)) { /* perfromance run */
 		results[0].seed1=0x3415;
 		results[0].seed2=0x3415;
 		results[0].seed3=0x66;
 	}
 #if (MEM_METHOD==MEM_STATIC)
 	results[0].memblock[0]=(void *)static_memblk;
 	results[0].size=TOTAL_DATA_SIZE;
 	results[0].err=0;
 	#if (MULTITHREAD>1)
 	#error "Cannot use a static data area with multiple contexts!"
 	#endif
 #elif (MEM_METHOD==MEM_MALLOC)
 	for (i=0 ; i<MULTITHREAD; i++) {
 		ee_s32 malloc_override=get_seed(7);
 		if (malloc_override != 0)
 			results[i].size=malloc_override;
 		else
 			results[i].size=TOTAL_DATA_SIZE;
 		results[i].memblock[0]=portable_malloc(results[i].size);
 		results[i].seed1=results[0].seed1;
 		results[i].seed2=results[0].seed2;
 		results[i].seed3=results[0].seed3;
 		results[i].err=0;
 		results[i].execs=results[0].execs;
 	}
 #elif (MEM_METHOD==MEM_STACK)
 	for (i=0 ; i<MULTITHREAD; i++) {
 		results[i].memblock[0]=stack_memblock+i*TOTAL_DATA_SIZE;
 		results[i].size=TOTAL_DATA_SIZE;
 		results[i].seed1=results[0].seed1;
 		results[i].seed2=results[0].seed2;
 		results[i].seed3=results[0].seed3;
 		results[i].err=0;
 		results[i].execs=results[0].execs;
 	}
 #else
 #error "Please define a way to initialize a memory block."
 #endif
 	/* Data init */
 	/* Find out how space much we have based on number of algorithms */
 	for (i=0; i<NUM_ALGORITHMS; i++) {
 		if ((1<<(ee_u32)i) & results[0].execs)
 			num_algorithms++;
 	}
 	for (i=0 ; i<MULTITHREAD; i++)
 		results[i].size=results[i].size/num_algorithms;
 	/* Assign pointers */
 	for (i=0; i<NUM_ALGORITHMS; i++) {
 		ee_u32 ctx;
 		if ((1<<(ee_u32)i) & results[0].execs) {
 			for (ctx=0 ; ctx<MULTITHREAD; ctx++)
 				results[ctx].memblock[i+1]=(char *)(results[ctx].memblock[0])+results[0].size*j;
 			j++;
 		}
 	}
 	/* call inits */
 	for (i=0 ; i<MULTITHREAD; i++) {
 		if (results[i].execs & ID_LIST) {
 			results[i].list=core_list_init(results[0].size,results[i].memblock[1],results[i].seed1);
 		}
 		if (results[i].execs & ID_MATRIX) {
 			core_init_matrix(results[0].size, results[i].memblock[2], (ee_s32)results[i].seed1 | (((ee_s32)results[i].seed2) << 16), &(results[i].mat) );
 		}
 		if (results[i].execs & ID_STATE) {
 			core_init_state(results[0].size,results[i].seed1,results[i].memblock[3]);
 		}
 	}
 	/* automatically determine number of iterations if not set */
 	if (results[0].iterations==0) {
 		secs_ret secs_passed=0;
 		ee_u32 divisor;
 		results[0].iterations=1;
 		while (secs_passed < (secs_ret)1) {
 			results[0].iterations*=10;
 			start_time();
 			iterate(&results[0]);
 			stop_time();
 			secs_passed=time_in_secs(get_time());
 		}
 		/* now we know it executes for at least 1 sec, set actual run time at about 10 secs */
 		divisor=(ee_u32)secs_passed;
 		if (divisor==0) /* some machines cast float to int as 0 since this conversion is not defined by ANSI, but we know at least one second passed */
 			divisor=1;
 		results[0].iterations*=1+10/divisor;
 	}
 	/* perform actual benchmark */
 	start_time();
 #if (MULTITHREAD>1)
 	if (default_num_contexts>MULTITHREAD) {
 		default_num_contexts=MULTITHREAD;
 	}
 	for (i=0 ; i<default_num_contexts; i++) {
 		results[i].iterations=results[0].iterations;
 		results[i].execs=results[0].execs;
 		core_start_parallel(&results[i]);
 	}
 	for (i=0 ; i<default_num_contexts; i++) {
 		core_stop_parallel(&results[i]);
 	}
 #else
 	iterate(&results[0]);
 #endif
 	stop_time();
 	total_time=get_time();
 	/* get a function of the input to report */
 	seedcrc=crc16(results[0].seed1,seedcrc);
 	seedcrc=crc16(results[0].seed2,seedcrc);
 	seedcrc=crc16(results[0].seed3,seedcrc);
 	seedcrc=crc16(results[0].size,seedcrc);
 	switch (seedcrc) { /* test known output for common seeds */
 		case 0x8a02: /* seed1=0, seed2=0, seed3=0x66, size 2000 per algorithm */
 			known_id=0;
 			ee_printf("6k performance run parameters for coremark.\n");
 			break;
 		case 0x7b05: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 2000 per algorithm */
 			known_id=1;
 			ee_printf("6k validation run parameters for coremark.\n");
 			break;
 		case 0x4eaf: /* seed1=0x8, seed2=0x8, seed3=0x8, size 400 per algorithm */
 			known_id=2;
 			ee_printf("Profile generation run parameters for coremark.\n");
 			break;
 		case 0xe9f5: /* seed1=0, seed2=0, seed3=0x66, size 666 per algorithm */
 			known_id=3;
 			ee_printf("2K performance run parameters for coremark.\n");
 			break;
 		case 0x18f2: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 666 per algorithm */
 			known_id=4;
 			ee_printf("2K validation run parameters for coremark.\n");
 			break;
 		default:
 			total_errors=-1;
 			break;
 	}
 	if (known_id>=0) {
 		for (i=0 ; i<default_num_contexts; i++) {
 			results[i].err=0;
 			if ((results[i].execs & ID_LIST) &&
 				(results[i].crclist!=list_known_crc[known_id])) {
 				ee_printf("[%u]ERROR! list crc 0x%04x - should be 0x%04x\n",i,results[i].crclist,list_known_crc[known_id]);
 				results[i].err++;
 			}
 			if ((results[i].execs & ID_MATRIX) &&
 				(results[i].crcmatrix!=matrix_known_crc[known_id])) {
 				ee_printf("[%u]ERROR! matrix crc 0x%04x - should be 0x%04x\n",i,results[i].crcmatrix,matrix_known_crc[known_id]);
 				results[i].err++;
 			}
 			if ((results[i].execs & ID_STATE) &&
 				(results[i].crcstate!=state_known_crc[known_id])) {
 				ee_printf("[%u]ERROR! state crc 0x%04x - should be 0x%04x\n",i,results[i].crcstate,state_known_crc[known_id]);
 				results[i].err++;
 			}
 			total_errors+=results[i].err;
 		}
 	}
 	total_errors+=check_data_types();
 	/* and report results */
 	ee_printf("CoreMark Size    : %d\n",(int)results[0].size);
 #if HAS_FLOAT
 	ee_printf("Total time (ms)  : %f\n",time_in_secs(total_time));
 	if (time_in_secs(total_time) > 0)
 		ee_printf("Iterations/mSec  : %f\n",default_num_contexts*results[0].iterations/time_in_secs(total_time));
 #else
 	ee_printf("Total time (ms)  : %d\n",time_in_secs(total_time));
 #endif
 	ee_printf("Iterations       : %d\n",(int)default_num_contexts*results[0].iterations);
 	ee_printf("Compiler version : %s\n",COMPILER_VERSION);
 #if (MULTITHREAD>1)
 	ee_printf("Parallel %s : %d\n",PARALLEL_METHOD,default_num_contexts);
 #endif
 	/* output for verification */
 	ee_printf("seedcrc          : 0x%04x\n",seedcrc);
 	if (results[0].execs & ID_LIST)
 		for (i=0 ; i<default_num_contexts; i++)
 			ee_printf("[%d]crclist       : 0x%04x\n",i,results[i].crclist);
 	if (results[0].execs & ID_MATRIX)
 		for (i=0 ; i<default_num_contexts; i++)
 			ee_printf("[%d]crcmatrix     : 0x%04x\n",i,results[i].crcmatrix);
 	if (results[0].execs & ID_STATE)
 		for (i=0 ; i<default_num_contexts; i++)
 			ee_printf("[%d]crcstate      : 0x%04x\n",i,results[i].crcstate);
 	for (i=0 ; i<default_num_contexts; i++)
 		ee_printf("[%d]crcfinal      : 0x%04x\n",i,results[i].crc);
  ee_printf("Finised in %d ms.\n", (int)total_time);
 	if (total_errors==0) {
    ee_printf("==================================================\n");
 	  ee_printf("CoreMark PASS       %d Marks\n", 2921400 / time_in_secs(total_time) * ITERATIONS / 1000);
 	  ee_printf("                vs. 100000 Marks (i7-7700K @ 4.20GHz)\n");
  }
 	if (total_errors>0)
 		ee_printf("Errors detected\n");
 	if (total_errors<0)
 		ee_printf("Cannot validate operation for these seed values, please compare with results on a known platform.\n");
 #if (MEM_METHOD==MEM_MALLOC)
 	for (i=0 ; i<MULTITHREAD; i++)
 		portable_free(results[i].memblock[0]);
 #endif
 	/* And last call any target specific code for finalizing */
 	portable_fini(&(results[0].port));
 	return MAIN_RETURN_VAL;
 }
--- a/coremark/src/core_matrix.c
+++ b/coremark/src/core_matrix.c
@ -0,0 +1,308 @@
 /*
 Author : Shay Gal-On, EEMBC
 This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
 All rights reserved.
 EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
 CoreMark License that is distributed with the official EEMBC COREMARK Software release.
 If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
 you must discontinue use and download the official release from www.coremark.org.
 Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
 make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
 EEMBC
 4354 Town Center Blvd. Suite 114-200
 El Dorado Hills, CA, 95762
 */
 #include "coremark.h"
 /*
 Topic: Description
 	Matrix manipulation benchmark
 	This very simple algorithm forms the basis of many more complex algorithms.
 	The tight inner loop is the focus of many optimizations (compiler as well as hardware based)
 	and is thus relevant for embedded processing.
 	The total available data space will be divided to 3 parts:
 	NxN Matrix A - initialized with small values (upper 3/4 of the bits all zero).
 	NxN Matrix B - initialized with medium values (upper half of the bits all zero).
 	NxN Matrix C - used for the result.
 	The actual values for A and B must be derived based on input that is not available at compile time.
 */
 /* Forward declarations for the matrix kernels defined later in this file. */
 ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val);
 ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval);
 void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val);
 void matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
 void matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
 void matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
 void matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val);
 /* matrix_test_next: x+1. NOTE(review): the argument is not parenthesized, so pass simple expressions only. */
 #define matrix_test_next(x) (x+1)
 /* matrix_clip: keep the low 8 bits of x when y is non-zero, otherwise the low 16 bits. */
 #define matrix_clip(x,y) ((y) ? (x) & 0x0ff : (x) & 0x0ffff)
 /* matrix_big: OR 0xf000 into x. */
 #define matrix_big(x) (0xf000 | (x))
 /* bit_extract: shift x right by `from`, then keep the low `to` bits (`to` acts as a width, not an end position). */
 #define bit_extract(x,from,to) (((x)>>(from)) & (~(0xffffffff << (to))))
 #if CORE_DEBUG
 void printmat(MATDAT *A, ee_u32 N, char *name) {
 	ee_u32 row,col;
 	/* Header line first, then one comma-separated line per matrix row. */
 	ee_printf("Matrix %s [%dx%d]:\n",name,N,N);
 	for (row=0; row<N; row++) {
 		for (col=0; col<N; col++) {
 			/* Separator goes before every element except the first of the row. */
 			if (col>0)
 				ee_printf(",");
 			ee_printf("%d",A[row*N+col]);
 		}
 		ee_printf("\n");
 	}
 }
 void printmatC(MATRES *C, ee_u32 N, char *name) {
 	ee_u32 row,col;
 	/* Same layout as printmat, but for the result-typed (MATRES) matrix. */
 	ee_printf("Matrix %s [%dx%d]:\n",name,N,N);
 	for (row=0; row<N; row++) {
 		for (col=0; col<N; col++) {
 			if (col>0)
 				ee_printf(",");
 			ee_printf("%d",C[row*N+col]);
 		}
 		ee_printf("\n");
 	}
 }
 #endif
 /* Function: core_bench_matrix
 	Benchmark function
 	Iterate <matrix_test> N times,
 	changing the matrix values slightly by a constant amount each time.
 */
 ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc) {
 	/* Run one pass of the matrix kernels on the buffers described by p,
 	   using the seed as the constant operand, and fold the pass result into crc. */
 	ee_s16 pass_result=matrix_test(p->N,p->C,p->A,p->B,(MATDAT)seed);
 	return crc16(pass_result,crc);
 }
 /* Function: matrix_test
 	Perform matrix manipulation.
 	Parameters:
 	N - Dimensions of the matrix.
 	C - memory for result matrix.
 	A - input matrix
 	B - operator matrix (not changed during operations)
 	Returns:
 	A CRC value that captures all results calculated in the function.
 	In particular, crc of the value calculated on the result matrix
 	after each step by <matrix_sum>.
 	Operation:
 	1 - Add a constant value to all elements of a matrix.
 	2 - Multiply a matrix by a constant.
 	3 - Multiply a matrix by a vector.
 	4 - Multiply a matrix by a matrix.
 	5 - Add a constant value to all elements of a matrix.
 	After the last step, matrix A is back to original contents.
 */
 ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val) {
 	ee_u16 checksum=0;
 	MATDAT limit=matrix_big(val);
 	/* Perturb A by val so the kernels below see data that depends on the input. */
 	matrix_add_const(N,A,val);
 #if CORE_DEBUG
 	printmat(A,N,"matrix_add_const");
 #endif
 	/* Each kernel writes C; matrix_sum condenses C into one value that is folded into the checksum. */
 	matrix_mul_const(N,C,A,val);
 	checksum=crc16(matrix_sum(N,C,limit),checksum);
 #if CORE_DEBUG
 	printmatC(C,N,"matrix_mul_const");
 #endif
 	matrix_mul_vect(N,C,A,B);
 	checksum=crc16(matrix_sum(N,C,limit),checksum);
 #if CORE_DEBUG
 	printmatC(C,N,"matrix_mul_vect");
 #endif
 	matrix_mul_matrix(N,C,A,B);
 	checksum=crc16(matrix_sum(N,C,limit),checksum);
 #if CORE_DEBUG
 	printmatC(C,N,"matrix_mul_matrix");
 #endif
 	matrix_mul_matrix_bitextract(N,C,A,B);
 	checksum=crc16(matrix_sum(N,C,limit),checksum);
 #if CORE_DEBUG
 	printmatC(C,N,"matrix_mul_matrix_bitextract");
 #endif
 	/* Undo the initial perturbation so A is back to its starting contents. */
 	matrix_add_const(N,A,-val);
 	return checksum;
 }
 /* Function : core_init_matrix
 	Initialize the memory block for matrix benchmarking.
 	Parameters:
 	blksize - Size of memory to be initialized.
 	memblk - Pointer to memory block.
 	seed - Actual values chosen depend on the seed parameter.
 	p - pointers to <mat_params> containing initialized matrixes.
 	Returns:
 	Matrix dimensions.
 	Note:
 	The seed parameter MUST be supplied from a source that cannot be determined at compile time
 */
 ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p) {
 	ee_u32 N=0;
 	MATDAT *A;
 	MATDAT *B;
 	ee_s32 order=1;
 	MATDAT val;
 	ee_u32 i=0,j=0;
 	/* A zero seed would stay zero under the multiplicative update below, so use 1 instead. */
 	if (seed==0)
 		seed=1;
 	/* Find the first i for which i*i*2*4 reaches blksize; the matrix dimension is one less. */
 	while (j<blksize) {
 		i++;
 		j=i*i*2*4;
 	}
 	/* With blksize==0 the loop above never runs; i-1 would then wrap to 0xffffffff
 	   and the fill loops would write far outside memblk. Clamp to an empty matrix. */
 	N=(i>0) ? i-1 : 0;
 	/* A starts at the aligned beginning of the block, B directly follows A. */
 	A=(MATDAT *)align_mem(memblk);
 	B=A+N*N;
 	for (i=0; i<N; i++) {
 		for (j=0; j<N; j++) {
 			/* Advance the pseudo-random sequence; order increases with every element. */
 			seed = ( ( order * seed ) % 65536 );
 			val = (seed + order);
 			val=matrix_clip(val,0); /* B keeps the low 16 bits */
 			B[i*N+j] = val;
 			val =  (val + order);
 			val=matrix_clip(val,1); /* A keeps the low 8 bits */
 			A[i*N+j] = val;
 			order++;
 		}
 	}
 	/* Publish the layout: C is placed (aligned) right after B. */
 	p->A=A;
 	p->B=B;
 	p->C=(MATRES *)align_mem(B+N*N);
 	p->N=N;
 #if CORE_DEBUG
 	printmat(A,N,"A");
 	printmat(B,N,"B");
 #endif
 	return N;
 }
 /* Function: matrix_sum
 	Calculate a function that depends on the values of elements in the matrix.
 	For each element, accumulate into a temporary variable.
 	As long as this value is under the parameter clipval,
 	add 1 to the result if the element is bigger than the previous.
 	Otherwise, reset the accumulator and add 10 to the result.
 */
 ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval) {
 	MATRES running=0,last=0;
 	ee_s16 score=0;
 	ee_u32 row,col;
 	for (row=0; row<N; row++) {
 		for (col=0; col<N; col++) {
 			MATRES cur=C[row*N+col];
 			running+=cur;
 			if (running>clipval) {
 				/* Accumulator passed the clip value: score 10 and start accumulating afresh. */
 				score+=10;
 				running=0;
 			} else if (cur>last) {
 				/* Still under the clip value: score 1 only when the element grew. */
 				score+=1;
 			}
 			last=cur;
 		}
 	}
 	return score;
 }
 /* Function: matrix_mul_const
 	Multiply a matrix by a constant.
 	This could be used as a scaler for instance.
 */
 void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val) {
 	ee_u32 row,col;
 	for (row=0; row<N; row++) {
 		for (col=0; col<N; col++) {
 			/* Both operands are widened to MATRES before the multiply. */
 			const ee_u32 at=row*N+col;
 			C[at]=(MATRES)A[at] * (MATRES)val;
 		}
 	}
 }
 /* Function: matrix_add_const
 	Add a constant value to all elements of a matrix.
 */
 void matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val) {
 	ee_u32 row,col;
 	/* In-place update of every element of the NxN matrix A. */
 	for (row=0; row<N; row++) {
 		for (col=0; col<N; col++) {
 			const ee_u32 at=row*N+col;
 			A[at] += val;
 		}
 	}
 }
 /* Function: matrix_mul_vect
 	Multiply a matrix by a vector.
 	This is common in many simple filters (e.g. fir where a vector of coefficients is applied to the matrix.)
 */
 void matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
 	ee_u32 row,col;
 	/* Only the first N entries of C are written; the first N entries of B serve as the vector. */
 	for (row=0; row<N; row++) {
 		C[row]=0;
 		for (col=0; col<N; col++) {
 			C[row]+=(MATRES)A[row*N+col] * (MATRES)B[col];
 		}
 	}
 }
 /* Function: matrix_mul_matrix
 	Multiply a matrix by a matrix.
 	Basic code is used in many algorithms, mostly with minor changes such as scaling.
 */
 void matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
 	ee_u32 row,col,inner;
 	for (row=0; row<N; row++) {
 		for (col=0; col<N; col++) {
 			/* Row of A times column of B, accumulated directly into the output cell. */
 			const ee_u32 out=row*N+col;
 			C[out]=0;
 			for (inner=0; inner<N; inner++) {
 				C[out]+=(MATRES)A[row*N+inner] * (MATRES)B[inner*N+col];
 			}
 		}
 	}
 }
 /* Function: matrix_mul_matrix_bitextract
 	Multiply a matrix by a matrix, and extract some bits from the result.
 	Basic code is used in many algorithms, mostly with minor changes such as scaling.
 */
 void matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
 	ee_u32 row,col,inner;
 	for (row=0; row<N; row++) {
 		for (col=0; col<N; col++) {
 			const ee_u32 out=row*N+col;
 			C[out]=0;
 			for (inner=0; inner<N; inner++) {
 				/* Instead of the raw product, accumulate the product of two bit fields taken from it. */
 				MATRES prod=(MATRES)A[row*N+inner] * (MATRES)B[inner*N+col];
 				C[out]+=bit_extract(prod,2,4)*bit_extract(prod,5,7);
 			}
 		}
 	}
 }
--- a/coremark/src/core_portme.c
+++ b/coremark/src/core_portme.c
@ -0,0 +1,109 @@
 #include "coremark.h"
 /* Seed globals. They are declared volatile so their values must be read at run time;
    get_seed_32() in core_util.c returns them when SEED_METHOD is SEED_VOLATILE.
    Exactly one of the *_RUN macros is expected to be non-zero, otherwise seeds 1-3
    are either undefined or defined more than once. */
 #if VALIDATION_RUN
 	volatile ee_s32 seed1_volatile=0x3415;
 	volatile ee_s32 seed2_volatile=0x3415;
 	volatile ee_s32 seed3_volatile=0x66;
 #endif
 #if PERFORMANCE_RUN
 	volatile ee_s32 seed1_volatile=0x0;
 	volatile ee_s32 seed2_volatile=0x0;
 	volatile ee_s32 seed3_volatile=0x66;
 #endif
 #if PROFILE_RUN
 	volatile ee_s32 seed1_volatile=0x8;
 	volatile ee_s32 seed2_volatile=0x8;
 	volatile ee_s32 seed3_volatile=0x8;
 #endif
 	/* seed4 is initialised from ITERATIONS (core_portme.h); seed5 is 0. Both are defined for every run type. */
 	volatile ee_s32 seed4_volatile=ITERATIONS;
 	volatile ee_s32 seed5_volatile=0;
 /* Porting : Timing functions
 	How to capture time and convert to seconds must be ported to whatever is supported by the platform.
 	e.g. Read value from on board RTC, read value from cpu clock cycles performance counter etc.
 	Sample implementation for standard time.h and windows.h definitions included.
 */
 /* Define : TIMER_RES_DIVIDER
 	Divider to trade off timer resolution and total time that can be measured.
 	Use lower values to increase resolution, but make sure that overflow does not occur.
 	If there are issues with the return value overflowing, increase this value.
 	*/
 /* NOTE(review): the clock()-based macros below are not used by start_time()/stop_time()
    in this file, which sample uptime_ms() instead; presumably leftovers from the sample port. */
 #define NSECS_PER_SEC CLOCKS_PER_SEC
 #define CORETIMETYPE clock_t
 #define GETMYTIME(_t) (*_t=clock())
 #define MYTIMEDIFF(fin,ini) ((fin)-(ini))
 #define TIMER_RES_DIVIDER 1
 #define SAMPLE_TIME_IMPLEMENTATION 1
 #define EE_TICKS_PER_SEC (NSECS_PER_SEC / TIMER_RES_DIVIDER)
 /* Uptime in milliseconds: the `us` field of an AM_TIMER_UPTIME read (presumably microseconds)
    divided by 1000 and truncated to 32 bits. */
 static uint32_t uptime_ms() { return io_read(AM_TIMER_UPTIME).us / 1000; }
 /** Define Host specific (POSIX), or target specific global time variables. */
 /* Written by start_time() and stop_time() respectively; get_time() returns their difference. */
 unsigned long start_time_val, stop_time_val;
 /* Function : start_time
 	This function will be called right before starting the timed portion of the benchmark.
 	Implementation may be capturing a system timer (as implemented in the example code)
 	or zeroing some system parameters - e.g. setting the cpu clocks cycles to 0.
 */
 void start_time(void) {
  /* Sample the millisecond uptime counter as the start of the timed region. */
  const uint32_t now_ms = uptime_ms();
  start_time_val = now_ms;
 }
 /* Function : stop_time
 	This function will be called right after ending the timed portion of the benchmark.
 	Implementation may be capturing a system timer (as implemented in the example code)
 	or other system parameters - e.g. reading the current value of cpu cycles counter.
 */
 void stop_time(void) {
  /* Sample the millisecond uptime counter as the end of the timed region. */
  const uint32_t now_ms = uptime_ms();
  stop_time_val = now_ms;
 }
 /* Function : get_time
 	Return an abstract "ticks" number that signifies time on the system.
 	Actual value returned may be cpu cycles, milliseconds or any other value,
 	as long as it can be converted to seconds by <time_in_secs>.
 	This methodology is taken to accommodate any hardware or simulated platform.
 	The sample implementation returns millisecs by default,
 	and the resolution is controlled by <TIMER_RES_DIVIDER>
 */
 CORE_TICKS get_time(void) {
  /* Ticks elapsed between the last start_time() and stop_time() samples (milliseconds in this port). */
  CORE_TICKS elapsed = stop_time_val - start_time_val;
  return elapsed;
 }
 /* Function : time_in_secs
 	Convert the value returned by get_time to seconds.
 	The <secs_ret> type is used to accommodate systems with no support for floating point.
 	Default implementation implemented by the EE_TICKS_PER_SEC macro above.
 */
 secs_ret time_in_secs(CORE_TICKS ticks) {
  /* NOTE(review): the tick count is returned unchanged. Ticks in this port come from
     uptime_ms(), so callers receive milliseconds rather than seconds. */
  secs_ret converted = ticks;
  return converted;
 }
 /* Number of contexts main() iterates over when validating and reporting results;
    main() caps it to MULTITHREAD before a parallel run. */
 ee_u32 default_num_contexts=1;
 /* Function : portable_init
 	Target specific initialization code
 	Test for some common mistakes.
 */
 void portable_init(core_portable *p, int *argc, char *argv[])
 {
 	/* Sanity-check two type definitions of the port. Mismatches are only reported;
 	   initialization continues either way. argc and argv are not used here. */
 	const int ptr_int_ok=(sizeof(ee_ptr_int) == sizeof(ee_u8 *));
 	const int u32_ok=(sizeof(ee_u32) == 4);
 	if (!ptr_int_ok) {
 		ee_printf("ERROR! Please define ee_ptr_int to a type that holds a pointer!\n");
 	}
 	if (!u32_ok) {
 		ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
 	}
 	/* Mark the port context as initialized. */
 	p->portable_id=1;
 }
 /* Function : portable_fini
 	Target specific final code
 */
 void portable_fini(core_portable *p)
 {
 	/* Clear the id that portable_init set to 1. */
 	p->portable_id = 0;
 }
--- a/coremark/src/core_state.c
+++ b/coremark/src/core_state.c
@ -0,0 +1,277 @@
 /*
 Author : Shay Gal-On, EEMBC
 This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
 All rights reserved.
 EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
 CoreMark License that is distributed with the official EEMBC COREMARK Software release.
 If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
 you must discontinue use and download the official release from www.coremark.org.
 Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
 make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
 EEMBC
 4354 Town Center Blvd. Suite 114-200
 El Dorado Hills, CA, 95762
 */
 #include "coremark.h"
 /* local functions */
 /* Scans one token starting at *instr, advances *instr past it, bumps per-state counters
    in transition_count and returns the state reached. Defined at the end of this file. */
 enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count);
 /*
 Topic: Description
 	Simple state machines like this one are used in many embedded products.
 	For more complex state machines, sometimes a state transition table implementation is used instead,
 	trading speed of direct coding for ease of maintenance.
 	Since the main goal of using a state machine in CoreMark is to exercise the switch/if behaviour,
 	we are using a small moore machine.
 	In particular, this machine tests type of string input,
 	trying to determine whether the input is a number or something else.
 	(see core_state.png).
 */
 /* Function: core_bench_state
 	Benchmark function
 	Go over the input twice, once direct, and once after introducing some corruption.
 */
 ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock,
 		ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc)
 {
 	ee_u32 final_counts[NUM_CORE_STATES];
 	ee_u32 track_counts[NUM_CORE_STATES];
 	ee_u8 *cursor;
 	ee_u32 idx;
 #if CORE_DEBUG
 	ee_printf("State Bench: %d,%d,%d,%04x\n",seed1,seed2,step,crc);
 #endif
 	for (idx=0; idx<NUM_CORE_STATES; idx++) {
 		track_counts[idx]=0;
 		final_counts[idx]=0;
 	}
 	/* First pass: tokenize the untouched input up to the terminating 0 byte. */
 	cursor=memblock;
 	while (*cursor!=0) {
 		enum CORE_STATE fstate=core_state_transition(&cursor,track_counts);
 		final_counts[fstate]++;
 #if CORE_DEBUG
 		ee_printf("%d,",fstate);
 #endif
 	}
 #if CORE_DEBUG
 	ee_printf("\n");
 #endif
 	/* Corrupt every step-th byte with seed1, leaving the ',' separators alone. */
 	for (cursor=memblock; cursor < (memblock+blksize); cursor+=step) {
 		if (*cursor!=',')
 			*cursor^=(ee_u8)seed1;
 	}
 	/* Second pass: tokenize the corrupted input. */
 	cursor=memblock;
 	while (*cursor!=0) {
 		enum CORE_STATE fstate=core_state_transition(&cursor,track_counts);
 		final_counts[fstate]++;
 #if CORE_DEBUG
 		ee_printf("%d,",fstate);
 #endif
 	}
 #if CORE_DEBUG
 	ee_printf("\n");
 #endif
 	/* XOR the same positions with seed2; this undoes the corruption if seed1 and seed2 are equal. */
 	for (cursor=memblock; cursor < (memblock+blksize); cursor+=step) {
 		if (*cursor!=',')
 			*cursor^=(ee_u8)seed2;
 	}
 	/* Fold both counter arrays, interleaved per state, into the running crc. */
 	for (idx=0; idx<NUM_CORE_STATES; idx++) {
 		crc=crcu32(final_counts[idx],crc);
 		crc=crcu32(track_counts[idx],crc);
 	}
 	return crc;
 }
 /* Default initialization patterns */
 /* Four sample tokens per category; core_init_state picks one using bits 3-4 of its seed.
    intpat entries are 4 characters long, all others 8 (matching the lengths core_init_state copies). */
 static ee_u8 *intpat[4]  ={(ee_u8 *)"5012",(ee_u8 *)"1234",(ee_u8 *)"-874",(ee_u8 *)"+122"};
 static ee_u8 *floatpat[4]={(ee_u8 *)"35.54400",(ee_u8 *)".1234500",(ee_u8 *)"-110.700",(ee_u8 *)"+0.64400"};
 static ee_u8 *scipat[4]  ={(ee_u8 *)"5.500e+3",(ee_u8 *)"-.123e-2",(ee_u8 *)"-87e+832",(ee_u8 *)"+0.6e-12"};
 static ee_u8 *errpat[4]  ={(ee_u8 *)"T0.3e-1F",(ee_u8 *)"-T.T++Tq",(ee_u8 *)"1T3.4e4z",(ee_u8 *)"34.0e-T^"};
 /* Function: core_init_state
 	Initialize the input data for the state machine.
 	Populate the input with several predetermined strings, interspersed.
 	Actual patterns chosen depend on the seed parameter.
 	Note:
 	The seed parameter MUST be supplied from a source that cannot be determined at compile time
 */
 void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p) {
 	ee_u32 total=0,next=0,i;
 	ee_u8 *buf=0;
 #if CORE_DEBUG
 	ee_u8 *start=p;
 	ee_printf("State: %d,%d\n",size,seed);
 #endif
 	/* An empty buffer has nothing to fill. Without this guard the size-- below would wrap
 	   to 0xffffffff and the copy loop would write far beyond p. */
 	if (size==0)
 		return;
 	/* Reserve the final byte so the buffer always ends with a 0 terminator. */
 	size--;
 	/* Each round first emits the token chosen in the previous round (plus a ','),
 	   then chooses the next one; the loop stops once the pending token no longer fits. */
 	while ((total+next+1)<size) {
 		if (next>0) {
 			for(i=0;i<next;i++)
 				*(p+total+i)=buf[i];
 			*(p+total+i)=',';
 			total+=next+1;
 		}
 		seed++;
 		/* Low 3 bits of the seed select the token category, bits 3-4 the pattern within it. */
 		switch (seed & 0x7) {
 			case 0: /* int */
 			case 1: /* int */
 			case 2: /* int */
 				buf=intpat[(seed>>3) & 0x3];
 				next=4;
 			break;
 			case 3: /* float */
 			case 4: /* float */
 				buf=floatpat[(seed>>3) & 0x3];
 				next=8;
 			break;
 			case 5: /* scientific */
 			case 6: /* scientific */
 				buf=scipat[(seed>>3) & 0x3];
 				next=8;
 			break;
 			case 7: /* invalid */
 				buf=errpat[(seed>>3) & 0x3];
 				next=8;
 			break;
 			default: /* Never happen, just to make some compilers happy */
 			break;
 		}
 	}
 	size++;
 	while (total<size) { /* fill the rest with 0 */
 		*(p+total)=0;
 		total++;
 	}
 #if CORE_DEBUG
 	ee_printf("State Input: %s\n",start);
 #endif
 }
 static ee_u8 ee_isdigit(ee_u8 c) {
 	/* Returns 1 for '0'..'9', otherwise 0. The two comparison results are combined
 	   with bitwise &, so both comparisons are always evaluated. */
 	return ((c>='0') & (c<='9')) ? 1 : 0;
 }
 /* Function: core_state_transition
 	Actual state machine.
 	The state machine will continue scanning until either:
 	1 - an invalid input is detected.
 	2 - a valid number has been detected.
 	The input pointer is updated to point to the end of the token, and the end state is returned (either specific format determined or invalid).
 */
 enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count) {
 	ee_u8 *str=*instr;
 	ee_u8 NEXT_SYMBOL;
 	enum CORE_STATE state=CORE_START;
 	/* Consume characters until the 0 terminator, a ',' separator, or an invalid state.
 	   On an invalid transition the loop increment still runs once, so the returned
 	   pointer is already past the offending character. */
 	for( ; *str && state != CORE_INVALID; str++ ) {
 		NEXT_SYMBOL = *str;
 		if (NEXT_SYMBOL==',') /* end of this input */ {
 			/* Step over the separator so the next call starts on the following token. */
 			str++;
 			break;
 		}
 		switch(state) {
 		case CORE_START:
 			if(ee_isdigit(NEXT_SYMBOL)) {
 				state = CORE_INT;
 			}
 			else if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
 				state = CORE_S1;
 			}
 			else if( NEXT_SYMBOL == '.' ) {
 				state = CORE_FLOAT;
 			}
 			else {
 				state = CORE_INVALID;
 				transition_count[CORE_INVALID]++;
 			}
 			/* Leaving CORE_START is always counted, in addition to any CORE_INVALID count above. */
 			transition_count[CORE_START]++;
 			break;
 		case CORE_S1:
 			/* After a leading sign: every outcome counts as a CORE_S1 transition. */
 			if(ee_isdigit(NEXT_SYMBOL)) {
 				state = CORE_INT;
 				transition_count[CORE_S1]++;
 			}
 			else if( NEXT_SYMBOL == '.' ) {
 				state = CORE_FLOAT;
 				transition_count[CORE_S1]++;
 			}
 			else {
 				state = CORE_INVALID;
 				transition_count[CORE_S1]++;
 			}
 			break;
 		case CORE_INT:
 			/* Further digits keep the state and are not counted. */
 			if( NEXT_SYMBOL == '.' ) {
 				state = CORE_FLOAT;
 				transition_count[CORE_INT]++;
 			}
 			else if(!ee_isdigit(NEXT_SYMBOL)) {
 				state = CORE_INVALID;
 				transition_count[CORE_INT]++;
 			}
 			break;
 		case CORE_FLOAT:
 			/* Further digits keep the state and are not counted. */
 			if( NEXT_SYMBOL == 'E' || NEXT_SYMBOL == 'e' ) {
 				state = CORE_S2;
 				transition_count[CORE_FLOAT]++;
 			}
 			else if(!ee_isdigit(NEXT_SYMBOL)) {
 				state = CORE_INVALID;
 				transition_count[CORE_FLOAT]++;
 			}
 			break;
 		case CORE_S2:
 			/* Directly after the exponent marker a sign is required. */
 			if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
 				state = CORE_EXPONENT;
 				transition_count[CORE_S2]++;
 			}
 			else {
 				state = CORE_INVALID;
 				transition_count[CORE_S2]++;
 			}
 			break;
 		case CORE_EXPONENT:
 			/* After the exponent sign a digit is required. */
 			if(ee_isdigit(NEXT_SYMBOL)) {
 				state = CORE_SCIENTIFIC;
 				transition_count[CORE_EXPONENT]++;
 			}
 			else {
 				state = CORE_INVALID;
 				transition_count[CORE_EXPONENT]++;
 			}
 			break;
 		case CORE_SCIENTIFIC:
 			/* NOTE(review): a failure here is counted under CORE_INVALID, not CORE_SCIENTIFIC.
 			   core_bench_state folds these counters into the CRC, so changing this would
 			   change the benchmark's checksums. */
 			if(!ee_isdigit(NEXT_SYMBOL)) {
 				state = CORE_INVALID;
 				transition_count[CORE_INVALID]++;
 			}
 			break;
 		default:
 			break;
 		}
 	}
 	/* Report how far the scan got and which state it ended in. */
 	*instr=str;
 	return state;
 }
--- a/coremark/src/core_util.c
+++ b/coremark/src/core_util.c
@ -0,0 +1,210 @@
 /*
 Author : Shay Gal-On, EEMBC
 This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
 All rights reserved.
 EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
 CoreMark License that is distributed with the official EEMBC COREMARK Software release.
 If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
 you must discontinue use and download the official release from www.coremark.org.
 Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
 make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
 EEMBC
 4354 Town Center Blvd. Suite 114-200
 El Dorado Hills, CA, 95762
 */
 #include "coremark.h"
 /* Function: get_seed
 	Get values that cannot be determined at compile time.
 	Since different embedded systems and compilers are used, 3 different methods are provided:
 	1 - Using a volatile variable. This method is only valid if the compiler is forced to generate code that
 	reads the value of a volatile variable from memory at run time.
 	Please note, if using this method, you would need to modify core_portme.c to generate training profile.
 	2 - Command line arguments. This is the preferred method if command line arguments are supported.
 	3 - System function. If none of the first 2 methods is available on the platform,
 	a system function which is not a stub can be used.
 	e.g. read the value on GPIO pins connected to switches, or invoke special simulator functions.
 */
 #if (SEED_METHOD==SEED_VOLATILE)
 	/* Seed variables defined in core_portme.c; volatile forces a real memory read in get_seed_32. */
 	extern volatile ee_s32 seed1_volatile;
 	extern volatile ee_s32 seed2_volatile;
 	extern volatile ee_s32 seed3_volatile;
 	extern volatile ee_s32 seed4_volatile;
 	extern volatile ee_s32 seed5_volatile;
 	ee_s32 get_seed_32(int i) {
 		/* Return the volatile seed with index i (1..5); any other index yields 0.
 		   Only the selected variable is read. */
 		if (i==1)
 			return seed1_volatile;
 		if (i==2)
 			return seed2_volatile;
 		if (i==3)
 			return seed3_volatile;
 		if (i==4)
 			return seed4_volatile;
 		if (i==5)
 			return seed5_volatile;
 		return 0;
 	}
 #elif (SEED_METHOD==SEED_ARG)
 ee_s32 parseval(char *valstring) {
 	ee_s32 value=0;
 	ee_s32 sign=1;
 	ee_s32 base=10;
 	/* Optional leading '-' followed by an optional lowercase "0x" prefix. */
 	if (*valstring == '-') {
 		sign=-1;
 		valstring++;
 	}
 	if ((valstring[0] == '0') && (valstring[1] == 'x')) {
 		base=16;
 		valstring+=2;
 	}
 	/* Accumulate digits; in hex mode only lowercase a-f are accepted. */
 	for (;;) {
 		ee_s32 digit;
 		if ((*valstring >= '0') && (*valstring <= '9'))
 			digit=*valstring-'0';
 		else if ((base==16) && (*valstring >= 'a') && (*valstring <= 'f'))
 			digit=10+*valstring-'a';
 		else
 			break;
 		value*=base;
 		value+=digit;
 		valstring++;
 	}
 	/* Optional size suffix on the first character after the digits. */
 	if (*valstring=='K')
 		value*=1024;
 	if (*valstring=='M')
 		value*=1024*1024;
 	value*=sign;
 	return value;
 }
 ee_s32 get_seed_args(int i, int argc, char *argv[]) {
 	/* Arguments that were not supplied yield a seed of 0. */
 	if (argc<=i)
 		return 0;
 	return parseval(argv[i]);
 }
 #elif (SEED_METHOD==SEED_FUNC)
 /* If using OS based function, you must define and implement the functions below in core_portme.h and core_portme.c ! */
 ee_s32 get_seed_32(int i) {
 	/* Dispatch to the port-supplied function for seed i (1..5); any other index yields 0.
 	   Only the selected function is called. */
 	if (i==1)
 		return portme_sys1();
 	if (i==2)
 		return portme_sys2();
 	if (i==3)
 		return portme_sys3();
 	if (i==4)
 		return portme_sys4();
 	if (i==5)
 		return portme_sys5();
 	return 0;
 }
 #endif
 /* Function: crc*
 	Service functions to calculate 16b CRC code.
 */
 ee_u16 crcu8(ee_u8 data, ee_u16 crc )
 {
 	ee_u8 bit;
 	/* Process the byte one bit at a time, least significant bit first. */
 	for (bit = 0; bit < 8; bit++) {
 		/* Feedback is the XOR of the current data bit and the crc's low bit. */
 		ee_u8 feedback = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1));
 		data >>= 1;
 		if (feedback == 1)
 			crc ^= 0x4002;
 		crc >>= 1;
 		/* The top bit after the shift mirrors the feedback bit. */
 		if (feedback == 1)
 			crc |= 0x8000;
 		else
 			crc &= 0x7fff;
 	}
 	return crc;
 }
 ee_u16 crcu16(ee_u16 newval, ee_u16 crc) {
 	/* Feed the low byte first, then the high byte. */
 	const ee_u8 low_byte=(ee_u8) (newval);
 	const ee_u8 high_byte=(ee_u8) ((newval)>>8);
 	return crcu8(high_byte,crcu8(low_byte,crc));
 }
 ee_u16 crcu32(ee_u32 newval, ee_u16 crc) {
 	/* Feed the low 16 bits first, then the high 16 bits. */
 	const ee_s16 low_half=(ee_s16) newval;
 	const ee_s16 high_half=(ee_s16) (newval>>16);
 	return crc16(high_half,crc16(low_half,crc));
 }
 ee_u16 crc16(ee_s16 newval, ee_u16 crc) {
 	/* Signed front end: convert the value to unsigned and defer to crcu16. */
 	const ee_u16 as_unsigned=(ee_u16)newval;
 	return crcu16(as_unsigned, crc);
 }
 ee_u8 check_data_types() {
 	/* Count how many of the port's basic type definitions have an unexpected size. */
 	ee_u8 mismatches=0;
 	if (sizeof(ee_u8) != 1) {
 		ee_printf("ERROR: ee_u8 is not an 8b datatype!\n");
 		mismatches++;
 	}
 	if (sizeof(ee_u16) != 2) {
 		ee_printf("ERROR: ee_u16 is not a 16b datatype!\n");
 		mismatches++;
 	}
 	if (sizeof(ee_s16) != 2) {
 		ee_printf("ERROR: ee_s16 is not a 16b datatype!\n");
 		mismatches++;
 	}
 	if (sizeof(ee_s32) != 4) {
 		ee_printf("ERROR: ee_s32 is not a 32b datatype!\n");
 		mismatches++;
 	}
 	if (sizeof(ee_u32) != 4) {
 		ee_printf("ERROR: ee_u32 is not a 32b datatype!\n");
 		mismatches++;
 	}
 	if (sizeof(ee_ptr_int) != sizeof(int *)) {
 		ee_printf("ERROR: ee_ptr_int is not a datatype that holds an int pointer!\n");
 		mismatches++;
 	}
 	/* One summary hint when anything failed; the count itself is the return value. */
 	if (mismatches>0) {
 		ee_printf("ERROR: Please modify the datatypes in core_portme.h!\n");
 	}
 	return mismatches;
 }
--- a/dhrystone/Makefile
+++ b/dhrystone/Makefile
@ -0,0 +1,3 @@
 # Names the program and its single source file, then includes the shared Makefile under $(AM_HOME).
 NAME = dhrystone
 SRCS = dry.c
 include $(AM_HOME)/Makefile
--- a/dhrystone/dry.c
+++ b/dhrystone/dry.c
@ -0,0 +1,950 @@
 /****************** "DHRYSTONE" Benchmark Program ***************************/
 #define Version "C, Version 2.2"
 /*  File:       dhry_1.c (part 2 of 3)
 *  Author:     Reinhold P. Weicker
 *              Siemens Nixdorf, Paderborn/Germany
 *              weicker@specbench.org
 *  Date:       May 25, 1988
 *  Modified:	Steven Pemberton, CWI, Amsterdam; Steven.Pemberton@cwi.nl
 *  Date:       October, 1993; March 1995
 *              Included both files into one source, that gets compiled
 *              in two passes. Made program auto-compiling, and auto-running,
 *              and generally made it much easier to use.
 *
 *              Original Version (in Ada) published in
 *              "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
 *              pp. 1013 - 1030, together with the statistics
 *              on which the distribution of statements etc. is based.
 *
 *              In this C version, the following C library functions are used:
 *              - strcpy, strcmp (inside the measurement loop)
 *              - printf, scanf (outside the measurement loop)
 *              In addition, Berkeley UNIX system calls "times ()" or "time ()"
 *              are used for execution time measurement. For measurements
 *              on other systems, these calls have to be changed.
 *
 *  Collection of Results:
 *              Reinhold Weicker (address see above) and
 *
 *              Rick Richardson
 *              PC Research. Inc.
 *              94 Apple Orchard Drive
 *              Tinton Falls, NJ 07724
 *                      Phone:  (201) 389-8963 (9-17 EST)
 *                      Usenet: ...!uunet!pcrat!rick
 *
 *      Please send results to Rick Richardson and/or Reinhold Weicker.
 *      Complete information should be given on hardware and software used.
 *      Hardware information includes: Machine type, CPU, type and size
 *      of caches; for microprocessors: clock frequency, memory speed
 *      (number of wait states).
 *      Software information includes: Compiler (and runtime library)
 *      manufacturer and version, compilation switches, OS version.
 *      The Operating System version may give an indication about the compiler;
 *      Dhrystone itself performs no OS calls in the measurement loop.
 *
 *      The complete output generated by the program should be mailed
 *      such that at least some checks for correctness can be made.
 *
 ***************************************************************************
 *
 * Defines:     The following "Defines" are possible:
 *      -DREG          (default: Not defined)
 *              As an approximation to what an average C programmer
 *              might do, causes the "register" storage class to be applied
 *              - for local variables, if they are used (dynamically)
 *                five or more times
 *              - for parameters if they are used (dynamically)
 *                six or more times
 *              Note that an optimal "register" strategy is
 *              compiler-dependent, and that "register" declarations
 *              do not necessarily lead to faster execution.
 *      -DNOSTRUCTASSIGN        (default: Not defined)
 *              Define if the C compiler does not support
 *              assignment of structures.
 *      -DNOENUM                (default: Not defined)
 *              Define if the C compiler does not support
 *              enumeration types.
 *      -DTIMES                 (default)
 *      -DTIME
 *              The "times" function of UNIX (returning process times)
 *              or the "time" function (returning wallclock time)
 *              is used for measurement.
 *              For single user machines, "time ()" is adequate. For
 *              multi-user machines where you cannot get single-user
 *              access, use the "times ()" function. If you have
 *              neither, use a stopwatch in the dead of night.
 *              "printf"s are provided marking the points "Start Timer"
 *              and "Stop Timer". DO NOT use the UNIX "time(1)"
 *              command, as this will measure the total time to
 *              run this program, which will (erroneously) include
 *              the time to allocate storage (malloc) and to perform
 *              the initialization.
 *      -DHZ=nnn
 *              In Berkeley UNIX, the function "times" returns process
 *              time in 1/HZ seconds, with HZ = 60 for most systems.
 *              CHECK YOUR SYSTEM DESCRIPTION BEFORE YOU JUST APPLY
 *              A VALUE.
 *
 ***************************************************************************
 *
 *  History:	Version C/2.1 was made for two reasons:
 *
 *	1) There was an obvious need for a common C version of
 *      Dhrystone, since C is at present the most popular system
 *      programming language for the class of processors
 *      (microcomputers, minicomputers) where Dhrystone is used most.
 *      There should be, as far as possible, only one C version of
 *      Dhrystone such that results can be compared without
 *      restrictions. In the past, the C versions distributed
 *      by Rick Richardson (Version 1.1) and by Reinhold Weicker
 *      had small (though not significant) differences.
 *
 *      2) As far as it is possible without changes to the Dhrystone
 *      statistics, optimizing compilers should be prevented from
 *      removing significant statements.
 *
 *      This C version has been developed in cooperation with
 *      Rick Richardson (Tinton Falls, NJ), it incorporates many
 *      ideas from the "Version 1.1" distributed previously by
 *      him over the UNIX network Usenet.
 *      I also thank Chaim Benedelac (National Semiconductor),
 *      David Ditzel (SUN), Earl Killian and John Mashey (MIPS),
 *      Alan Smith and Rafael Saavedra-Barrera (UC at Berkeley)
 *      for their help with comments on earlier versions of the
 *      benchmark.
 *
 *  Changes:    In the initialization part, this version follows mostly
 *      Rick Richardson's version distributed via Usenet, not the
 *      version distributed earlier via floppy disk by Reinhold Weicker.
 *      As a concession to older compilers, names have been made
 *      unique within the first 8 characters.
 *      Inside the measurement loop, this version follows the
 *      version previously distributed by Reinhold Weicker.
 *
 *      At several places in the benchmark, code has been added,
 *      but within the measurement loop only in branches that
 *      are not executed. The intention is that optimizing compilers
 *      should be prevented from moving code out of the measurement
 *      loop, or from removing code altogether. Since the statements
 *      that are executed within the measurement loop have NOT been
 *      changed, the numbers defining the "Dhrystone distribution"
 *      (distribution of statements, operand types and locality)
 *      still hold. Except for sophisticated optimizing compilers,
 *      execution times for this version should be the same as
 *      for previous versions.
 *
 *      Since it has proven difficult to subtract the time for the
 *      measurement loop overhead in a correct way, the loop check
 *      has been made a part of the benchmark. This does have
 *      an impact - though a very minor one - on the distribution
 *      statistics which have been updated for this version.
 *
 *      All changes within the measurement loop are described
 *      and discussed in the companion paper "Rationale for
 *      Dhrystone version 2".
 *
 *      Because of the self-imposed limitation that the order and
 *      distribution of the executed statements should not be
 *      changed, there are still cases where optimizing compilers
 *      may not generate code for some statements. To a certain
 *      degree, this is unavoidable for small synthetic benchmarks.
 *      Users of the benchmark are advised to check code listings
 *      whether code is generated for all statements of Dhrystone.
 *
 *      Version 2.1 is identical to version 2.0 distributed via
 *      the UNIX network Usenet in March 1988 except that it corrects
 *      some minor deficiencies that were found by users of version 2.0.
 *      The only change within the measurement loop is that a
 *      non-executed "else" part was added to the "if" statement in
 *      Func_3, and a non-executed "else" part removed from Proc_3.
 *
 * Version C/2.2, Steven Pemberton, October 1993
 *	Functionally identical to version 2.1; the changes are in
 *	how you compile and use it:
 *	- Everything is in one file now, but compiled in 2 passes
 *	- Compile (and run) by running the file through the shell: 'sh dhry.c'
 *	- Uses the system definition of HZ if one can be found
 *	- HZ must be defined, otherwise it won't compile (no defaults here)
 *	- The (uninteresting) output is printed to stderr (dhry2 > /dev/null)
 *	- The number of loops is passed as a parameter, rather than read
 *	  (dhry2 500000)
 *	- If the number of loops is insufficient to get a good result,
 *	  it repeats it with loops*10 until it is enough (rather than just
 *	  stopping)
 *	- Output says which sort of clock it is using, and the HZ value
 *	- You can use -DREG instead of the -DREG=register of previous versions
 *	- Some stylistic cleanups.
 *
 ***************************************************************************
 *
 *  Compilation model and measurement (IMPORTANT):
 *
 *  The following "ground rules" apply for measurements:
 *  - Separate compilation
 *  - No procedure merging
 *  - Otherwise, compiler optimizations are allowed but should be indicated
 *  - Default results are those without register declarations
 *  See the companion paper "Rationale for Dhrystone Version 2" for a more
 *  detailed discussion of these ground rules.
 *
 *  For 16-Bit processors (e.g. 80186, 80286), times for all compilation
 *  models ("small", "medium", "large" etc.) should be given if possible,
 *  together with a definition of these models for the compiler system used.
 *
 **************************************************************************
 *
 *  Dhrystone (C version) statistics:
 *
 *  [Comment from the first distribution, updated for version 2.
 *   Note that because of language differences, the numbers are slightly
 *   different from the Ada version.]
 *
 *  The following program contains statements of a high level programming
 *  language (here: C) in a distribution considered representative:
 *
 *    assignments                  52 (51.0 %)
 *    control statements           33 (32.4 %)
 *    procedure, function calls    17 (16.7 %)
 *
 *  103 statements are dynamically executed. The program is balanced with
 *  respect to the three aspects:
 *
 *    - statement type
 *    - operand type
 *    - operand locality
 *         operand global, local, parameter, or constant.
 *
 *  The combination of these three aspects is balanced only approximately.
 *
 *  1. Statement Type:
 *  -----------------             number
 *
 *     V1 = V2                     9
 *       (incl. V1 = F(..))
 *     V = Constant               12
 *     Assignment,                 7
 *       with array element
 *     Assignment,                 6
 *       with record component
 *                                --
 *                                34       34
 *
 *     X = Y +|-|"&&"|"|" Z        5
 *     X = Y +|-|"==" Constant     6
 *     X = X +|- 1                 3
 *     X = Y *|/ Z                 2
 *     X = Expression,             1
 *           two operators
 *     X = Expression,             1
 *           three operators
 *                                --
 *                                18       18
 *
 *     if ....                    14
 *       with "else"      7
 *       without "else"   7
 *           executed        3
 *           not executed    4
 *     for ...                     7  |  counted every time
 *     while ...                   4  |  the loop condition
 *     do ... while                1  |  is evaluated
 *     switch ...                  1
 *     break                       1
 *     declaration with            1
 *       initialization
 *                                --
 *                                34       34
 *
 *     P (...)  procedure call    11
 *       user procedure      10
 *       library procedure    1
 *     X = F (...)
 *             function  call      6
 *       user function        5
 *       library function     1
 *                                --
 *                                17       17
 *                                        ---
 *                                        103
 *
 *    The average number of parameters in procedure or function calls
 *    is 1.82 (not counting the function values as implicit parameters).
 *
 *  2. Operators
 *  ------------
 *                          number    approximate
 *                                    percentage
 *
 *    Arithmetic             32          50.8
 *
 *       +                     21          33.3
 *       -                      7          11.1
 *       *                      3           4.8
 *       / (int div)            1           1.6
 *
 *    Comparison             27           42.8
 *
 *       ==                     9           14.3
 *       /=                     4            6.3
 *       >                      1            1.6
 *       <                      3            4.8
 *       >=                     1            1.6
 *       <=                     9           14.3
 *
 *    Logic                   4            6.3
 *
 *       && (AND-THEN)          1            1.6
 *       |  (OR)                1            1.6
 *       !  (NOT)               2            3.2
 *
 *                           --          -----
 *                           63          100.1
 *
 *
 *  3. Operand Type (counted once per operand reference):
 *  ---------------
 *                          number    approximate
 *                                    percentage
 *
 *     Integer               175        72.3 %
 *     Character              45        18.6 %
 *     Pointer                12         5.0 %
 *     String30                6         2.5 %
 *     Array                   2         0.8 %
 *     Record                  2         0.8 %
 *                           ---       -------
 *                           242       100.0 %
 *
 *  When there is an access path leading to the final operand (e.g. a record
 *  component), only the final data type on the access path is counted.
 *
 *
 *  4. Operand Locality:
 *  -------------------
 *                                number    approximate
 *                                          percentage
 *
 *     local variable              114        47.1 %
 *     global variable              22         9.1 %
 *     parameter                    45        18.6 %
 *        value                        23         9.5 %
 *        reference                    22         9.1 %
 *     function result               6         2.5 %
 *     constant                     55        22.7 %
 *                                 ---       -------
 *                                 242       100.0 %
 *
 *  The program does not compute anything meaningful, but it is syntactically
 *  and semantically correct. All variables have a value assigned to them
 *  before they are used as a source operand.
 *
 *  There has been no explicit effort to account for the effects of a
 *  cache, or to balance the use of long or short displacements for code or
 *  data.
 *
 ***************************************************************************
 */
 /* Compiler and system dependent definitions: */
 /* variables for time measurement: */
 #include <am.h>
 #include <klib.h>
 #include <klib-macros.h>
 /* Current uptime in milliseconds: the microsecond field of the AM uptime */
 /* timer register divided by 1000, narrowed to 32 bits on return.         */
 static uint32_t uptime_ms(void) { return io_read(AM_TIMER_UPTIME).us / 1000; }
 /* Timer hooks placed around the measurement loop; they store the current */
 /* uptime in Begin_Time / End_Time. Expansions are parenthesized so they  */
 /* behave as a single expression wherever they are used.                  */
 #define Start_Timer() (Begin_Time = uptime_ms())
 #define Stop_Timer()  (End_Time   = uptime_ms())
 #define NUMBER_OF_RUNS		500000 /* Default number of runs */
 #define PASS2
 /* structassign(d, s): copy structure s into d. Uses memcpy when the      */
 /* build defines NOSTRUCTASSIGN, plain structure assignment otherwise.    */
 #ifdef  NOSTRUCTASSIGN
 #define structassign(d, s)      memcpy(&(d), &(s), sizeof(d))
 #else
 #define structassign(d, s)      ((d) = (s))
 #endif
 /* Enumeration: five-valued type used for record discriminants and the   */
 /* enumeration operands of the benchmark. Build with -DNOENUM when the   */
 /* compiler lacks enum support (the historical -DNOENUMS spelling is     */
 /* accepted too); the identifiers then become int constants with the     */
 /* same values 0..4 that the enum would assign.                          */
 #if defined(NOENUM) || defined(NOENUMS)
 #define Ident_1 0
 #define Ident_2 1
 #define Ident_3 2
 #define Ident_4 3
 #define Ident_5 4
  typedef int   Enumeration;
 #else
  typedef       enum    {Ident_1, Ident_2, Ident_3, Ident_4, Ident_5}
                Enumeration;
 #endif
        /* for boolean and enumeration types in Ada, Pascal */
 /* General definitions: */
 #define Null 0
                /* Value of a Null pointer */
 /* Integer aliases; the names mirror the subrange types of the Ada original. */
 typedef int     One_Thirty;
 typedef int     One_Fifty;
 typedef char    Capital_Letter;
 /* Boolean is a plain int; the code stores true/false and comparison results in it. */
 typedef int     Boolean;
 /* 31 chars: room for the 30-character benchmark strings plus the terminator. */
 typedef char    Str_30 [31];
 typedef int     Arr_1_Dim [50];
 typedef int     Arr_2_Dim [50] [50];
 /* Variant record: a link to another record, a discriminant, and a union  */
 /* of three layouts. The code in this file only accesses var_1.           */
 typedef struct record
    {
    struct record *Ptr_Comp;
    Enumeration    Discr;
    union {
          struct {
                  Enumeration Enum_Comp;
                  int         Int_Comp;
                  char        Str_Comp [31];
                  } var_1;
          struct {
                  Enumeration E_Comp_2;
                  char        Str_2_Comp [31];
                  } var_2;
          struct {
                  char        Ch_1_Comp;
                  char        Ch_2_Comp;
                  } var_3;
          } variant;
      } Rec_Type, *Rec_Pointer;
 /* Global Variables: */
 /* Ptr_Glob and Next_Ptr_Glob are set up in main to point at the two      */
 /* records; the remaining globals are operands read and written by the    */
 /* Proc_ and Func_ routines and verified by main after the run.           */
 Rec_Pointer     Ptr_Glob,
                Next_Ptr_Glob;
 int             Int_Glob;
 Boolean         Bool_Glob;
 char            Ch_1_Glob,
                Ch_2_Glob;
 int             Arr_1_Glob [50];
 int             Arr_2_Glob [50] [50];
 Enumeration     Func_1 ();
  /* forward declaration necessary since Enumeration may not simply be int */
 /* REG selection: without -DREG the macro expands to nothing and Reg is   */
 /* false; with -DREG it is redefined as the register keyword and Reg is   */
 /* true. Either way REG is a defined macro after this point.              */
 #ifndef REG
        Boolean Reg = false;
 #define REG
        /* REG becomes defined as empty */
        /* i.e. no register variables   */
 #else
        Boolean Reg = true;
 #undef REG
 #define REG register
 #endif
 /* Done ends the measurement loop in main; Begin_Time/End_Time receive    */
 /* the timer readings and User_Time their difference. Microseconds and    */
 /* Dhrystones_Per_Second are declared but not used in the code shown.     */
 Boolean		Done;
 long            Begin_Time,
                End_Time,
                User_Time;
 float           Microseconds,
                Dhrystones_Per_Second;
 /* end of variables for time measurement */
 /* Fixed pool backing myalloc; nothing is ever freed. */
 static char memory[1024];
 static char *free_mem = &memory[0];
 /* Bump allocator over the static pool.                                   */
 /*   size:    number of bytes requested.                                  */
 /*   returns: a pointer aligned to sizeof(void *) so that records which   */
 /*            contain pointers are correctly aligned on 64-bit targets    */
 /*            as well (on 32-bit targets this is the former 4-byte        */
 /*            alignment), or a null pointer when the request does not     */
 /*            fit in what is left of the pool.                            */
 static char* myalloc(size_t size) {
  const unsigned long align = sizeof(void *);
  unsigned long misalign = (unsigned long)free_mem % align;
  /* Bytes to skip so that the returned address is a multiple of align. */
  size_t pad = misalign ? (size_t)(align - misalign) : 0;
  size_t avail = (size_t)((memory + sizeof memory) - free_mem);
  /* Refuse rather than hand out memory beyond the end of the pool. */
  if (pad > avail || size > avail - pad) return (char *)0;
  char *ret = free_mem + pad;
  free_mem = ret + size;
  return ret;
 }
 /* Prototypes for routines that are called before their definitions. */
 void Proc_6 (Enumeration, Enumeration*);
 void Proc_3 (Rec_Pointer*);
 void Proc_7 (One_Fifty a, One_Fifty b, One_Fifty* c);
 Boolean Func_2 (Str_30, Str_30);
 void Proc_8(Arr_1_Dim, Arr_2_Dim, int, int);
 Boolean Func_3 (Enumeration);
 /* Proc_1: record manipulation through pointers.                          */
 /* Copies *Ptr_Glob into the record that Ptr_Val_Par links to, sets       */
 /* Int_Comp to 5 in both records, lets Proc_3 rewrite the next record's   */
 /* link, and then, when the next record's discriminant is Ident_1,        */
 /* updates its Int_Comp, Enum_Comp and link via Proc_6 and Proc_7.        */
 /* Otherwise the next record is copied back over *Ptr_Val_Par.            */
 void Proc_1 (Ptr_Val_Par)
 /******************/
 REG Rec_Pointer Ptr_Val_Par;
    /* executed once */
 {
  REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;
                                        /* == Ptr_Glob_Next */
  /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp,    */
  /* corresponds to "rename" in Ada, "with" in Pascal           */
  structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
  Ptr_Val_Par->variant.var_1.Int_Comp = 5;
  Next_Record->variant.var_1.Int_Comp
        = Ptr_Val_Par->variant.var_1.Int_Comp;
  Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
  Proc_3 (&Next_Record->Ptr_Comp);
    /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp
                        == Ptr_Glob->Ptr_Comp */
  if (Next_Record->Discr == Ident_1)
    /* then, executed */
  {
    Next_Record->variant.var_1.Int_Comp = 6;
    Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp,
           &Next_Record->variant.var_1.Enum_Comp);
    Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
    /* Proc_7 stores first + 2 + second, i.e. 6 + 2 + 10, back into Int_Comp. */
    Proc_7 (Next_Record->variant.var_1.Int_Comp, 10,
           &Next_Record->variant.var_1.Int_Comp);
  }
  else /* not executed */
    structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
 } /* Proc_1 */
 /* Proc_2: integer update through a reference parameter.                  */
 /* When Ch_1_Glob is 'A', stores (*Int_Par_Ref + 10 - 1 - Int_Glob) back  */
 /* through Int_Par_Ref and ends the loop by setting Enum_Loc to Ident_1.  */
 /* NOTE(review): Enum_Loc is assigned only inside that branch, so the     */
 /* loop condition would read an uninitialized value if Ch_1_Glob were     */
 /* anything other than 'A'.                                               */
 void Proc_2 (Int_Par_Ref)
 /******************/
    /* executed once */
    /* *Int_Par_Ref == 1, becomes 5 (main verifies Int_1_Loc == 5) */
 One_Fifty   *Int_Par_Ref;
 {
  One_Fifty  Int_Loc;
  Enumeration   Enum_Loc;
  Int_Loc = *Int_Par_Ref + 10;
  do /* executed once */
    if (Ch_1_Glob == 'A')
      /* then, executed */
    {
      Int_Loc -= 1;
      *Int_Par_Ref = Int_Loc - Int_Glob;
      Enum_Loc = Ident_1;
    } /* if */
  while (Enum_Loc != Ident_1); /* true */
 } /* Proc_2 */
 /* Proc_3: pointer update through a reference parameter.                  */
 /* Stores Ptr_Glob->Ptr_Comp through Ptr_Ref_Par when Ptr_Glob is not     */
 /* Null, then sets Ptr_Glob's Int_Comp to Int_Glob + 12 via Proc_7.       */
 /* Note that the Proc_7 call dereferences Ptr_Glob regardless of the      */
 /* preceding Null test.                                                   */
 void Proc_3 (Ptr_Ref_Par)
 /******************/
    /* executed once */
    /* Ptr_Ref_Par becomes Ptr_Glob */
 Rec_Pointer *Ptr_Ref_Par;
 {
  if (Ptr_Glob != Null)
    /* then, executed */
    *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
  Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);
 } /* Proc_3 */
 /* Proc_4: ORs (Ch_1_Glob == 'A') into Bool_Glob and sets Ch_2_Glob to 'B'. */
 void Proc_4 () /* without parameters */
 /*******/
    /* executed once */
 {
  Boolean Bool_Loc;
  Bool_Loc = Ch_1_Glob == 'A';
  Bool_Glob = Bool_Loc | Bool_Glob;
  Ch_2_Glob = 'B';
 } /* Proc_4 */
 /* Proc_5: resets the globals Ch_1_Glob to 'A' and Bool_Glob to false. */
 void Proc_5 () /* without parameters */
 /*******/
    /* executed once */
 {
  Ch_1_Glob = 'A';
  Bool_Glob = false;
 } /* Proc_5 */
        /* Procedure for the assignment of structures,          */
        /* if the C compiler doesn't support this feature       */
        /* Compiled only with -DNOSTRUCTASSIGN: a byte-by-byte  */
        /* copy of l bytes from s to d.                         */
        /* NOTE(review): the definition relies on implicit int  */
        /* and is named memcpy; the included library headers    */
        /* presumably declare memcpy already, so confirm this   */
        /* still builds before enabling NOSTRUCTASSIGN.         */
 #ifdef  NOSTRUCTASSIGN
 memcpy (d, s, l)
 register char   *d;
 register char   *s;
 register int    l;
 {
        while (l--) *d++ = *s++;
 }
 #endif
 /* REG has already been settled next to the global declarations (empty    */
 /* unless the build passes -DREG). It must not be re-evaluated with an    */
 /* #ifndef/#else here: an empty REG still counts as defined, so the #else */
 /* branch would turn every later REG into the register keyword even in a  */
 /* default build. Only supply the empty definition if REG is missing.     */
 #ifndef REG
 #define REG
        /* REG becomes defined as empty */
        /* i.e. no register variables   */
 #endif
 extern  int     Int_Glob;
 extern  char    Ch_1_Glob;
 /* Proc_6: maps an enumeration value to another one through a reference.  */
 /* *Enum_Ref_Par first receives Enum_Val_Par (or Ident_4 when Func_3      */
 /* rejects the value) and is then overwritten by the switch for every     */
 /* case except Ident_4, which keeps the value set before the switch.      */
 void Proc_6 (Enum_Val_Par, Enum_Ref_Par)
 /*********************************/
    /* executed once */
    /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */
 Enumeration  Enum_Val_Par;
 Enumeration *Enum_Ref_Par;
 {
  *Enum_Ref_Par = Enum_Val_Par;
  if (! Func_3 (Enum_Val_Par))
    /* then, not executed */
    *Enum_Ref_Par = Ident_4;
  switch (Enum_Val_Par)
  {
    case Ident_1:
      *Enum_Ref_Par = Ident_1;
      break;
    case Ident_2:
      if (Int_Glob > 100)
        /* then */
      *Enum_Ref_Par = Ident_1;
      else *Enum_Ref_Par = Ident_4;
      break;
    case Ident_3: /* executed */
      *Enum_Ref_Par = Ident_2;
      break;
    case Ident_4: break;
    case Ident_5:
      *Enum_Ref_Par = Ident_3;
      break;
  } /* switch */
 } /* Proc_6 */
 /* Proc_7: stores Int_1_Par_Val + 2 + Int_2_Par_Val through Int_Par_Ref. */
 void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val, One_Fifty *Int_Par_Ref)
 {
  One_Fifty Int_Loc;
  Int_Loc = Int_1_Par_Val + 2;
  *Int_Par_Ref = Int_2_Par_Val + Int_Loc;
 } /* Proc_7 */
 /* Proc_8: array element assignments.                                     */
 /* With Int_Loc = Int_1_Par_Val + 5 it writes Arr_1_Par_Ref at Int_Loc,   */
 /* Int_Loc+1 and Int_Loc+30, writes row Int_Loc of Arr_2_Par_Ref at       */
 /* columns Int_Loc and Int_Loc+1, increments [Int_Loc][Int_Loc-1], copies */
 /* one element into row Int_Loc+20, and finally sets Int_Glob to 5.       */
 /* The largest index used is Int_Loc+30, so Int_1_Par_Val must keep that  */
 /* below the 50-element array bounds.                                     */
 void Proc_8 (Arr_1_Par_Ref, Arr_2_Par_Ref, Int_1_Par_Val, Int_2_Par_Val)
 /*********************************************************************/
    /* executed once      */
    /* Int_Par_Val_1 == 3 */
    /* Int_Par_Val_2 == 7 */
 Arr_1_Dim       Arr_1_Par_Ref;
 Arr_2_Dim       Arr_2_Par_Ref;
 int             Int_1_Par_Val;
 int             Int_2_Par_Val;
 {
  REG One_Fifty Int_Index;
  REG One_Fifty Int_Loc;
  Int_Loc = Int_1_Par_Val + 5;
  Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val;
  Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc];
  Arr_1_Par_Ref [Int_Loc+30] = Int_Loc;
  for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index)
    Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc;
  /* This element grows by one per call; main expects Number_Of_Runs + 10. */
  Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1;
  Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc];
  Int_Glob = 5;
 } /* Proc_8 */
 /* Func_1: character comparison.                                          */
 /* Returns Ident_1 when the two characters differ. When they are equal    */
 /* it stores the character in Ch_1_Glob and returns Ident_2.              */
 Enumeration Func_1 (Ch_1_Par_Val, Ch_2_Par_Val)
 /*************************************************/
    /* executed three times                                         */
    /* first call:      Ch_1_Par_Val == 'H', Ch_2_Par_Val == 'R'    */
    /* second call:     Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C'    */
    /* third call:      Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C'    */
 Capital_Letter   Ch_1_Par_Val;
 Capital_Letter   Ch_2_Par_Val;
 {
  Capital_Letter        Ch_1_Loc;
  Capital_Letter        Ch_2_Loc;
  Ch_1_Loc = Ch_1_Par_Val;
  Ch_2_Loc = Ch_1_Loc;
  if (Ch_2_Loc != Ch_2_Par_Val)
    /* then, executed */
    return (Ident_1);
  else  /* not executed */
  {
    Ch_1_Glob = Ch_1_Loc;
    return (Ident_2);
   }
 } /* Func_1 */
 /* Func_2: string comparison.                                             */
 /* Loops until Func_1 reports that Str_1_Par_Ref[2] and Str_2_Par_Ref[3]  */
 /* differ, then returns true when Ch_Loc is 'R' or when strcmp orders     */
 /* the first string after the second (also storing Int_Loc + 7 in         */
 /* Int_Glob in that case), and false otherwise.                           */
 /* NOTE(review): Ch_Loc is assigned only inside the loop's if-branch; the */
 /* loop cannot exit without taking that branch, since Int_Loc changes     */
 /* nowhere else.                                                          */
 Boolean Func_2 (Str_1_Par_Ref, Str_2_Par_Ref)
 /*************************************************/
    /* executed once */
    /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
    /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */
 Str_30  Str_1_Par_Ref;
 Str_30  Str_2_Par_Ref;
 {
  REG One_Thirty        Int_Loc;
      Capital_Letter    Ch_Loc;
  Int_Loc = 2;
  while (Int_Loc <= 2) /* loop body executed once */
    if (Func_1 (Str_1_Par_Ref[Int_Loc],
                Str_2_Par_Ref[Int_Loc+1]) == Ident_1)
      /* then, executed */
    {
      Ch_Loc = 'A';
      Int_Loc += 1;
    } /* if, while */
  if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
    /* then, not executed */
    Int_Loc = 7;
  if (Ch_Loc == 'R') {
    /* then, not executed */
    return (true);
  }
  else /* executed */
  {
    if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0)
    {
      Int_Loc += 7;
      Int_Glob = Int_Loc;
      return (true);
    }
    else /* executed */
      return (false);
  } /* if Ch_Loc */
 } /* Func_2 */
 /* Func_3: returns true exactly when Enum_Par_Val is Ident_3. */
 Boolean Func_3 (Enum_Par_Val)
 /***************************/
    /* executed once        */
    /* Enum_Par_Val == Ident_3 */
 Enumeration Enum_Par_Val;
 {
  Enumeration Enum_Loc;
  Enum_Loc = Enum_Par_Val;
  if (Enum_Loc == Ident_3)
    /* then, executed */
    return (true);
  else /* not executed */
    return (false);
 } /* Func_3 */
 /* Overall verdict of the self-check; cleared by check() on the first failure. */
 Boolean pass = true;
 /* Records a failed condition in pass and hands cond back unchanged so */
 /* the caller can branch on it.                                        */
 Boolean check(int cond) {
  if (cond) return cond;
  pass = false;
  return cond;
 }
 int main ()
 /*****/
  /* main program, corresponds to procedures        */
  /* Main and Proc_0 in the Ada version             */
 {
        One_Fifty       Int_1_Loc;
  REG   One_Fifty       Int_2_Loc;
        One_Fifty       Int_3_Loc;
  REG   char            Ch_Index;
        Enumeration     Enum_Loc;
        Str_30          Str_1_Loc;
        Str_30          Str_2_Loc;
  REG   int             Run_Index;
  REG   int             Number_Of_Runs;
  ioe_init();
 Number_Of_Runs = NUMBER_OF_RUNS;
  /* Initializations */
  Next_Ptr_Glob = (Rec_Pointer) myalloc (sizeof (Rec_Type));
  Ptr_Glob = (Rec_Pointer) myalloc (sizeof (Rec_Type));
  Ptr_Glob->Ptr_Comp                    = Next_Ptr_Glob;
  Ptr_Glob->Discr                       = Ident_1;
  Ptr_Glob->variant.var_1.Enum_Comp     = Ident_3;
  Ptr_Glob->variant.var_1.Int_Comp      = 40;
  strcpy (Ptr_Glob->variant.var_1.Str_Comp,
          "DHRYSTONE PROGRAM, SOME STRING");
  strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
  Arr_2_Glob [8][7] = 10;
        /* Was missing in published program. Without this statement,    */
        /* Arr_2_Glob [8][7] would have an undefined value.             */
        /* Warning: With 16-Bit processors and Number_Of_Runs > 32000,  */
        /* overflow may occur for this array element.                   */
  printf ("Dhrystone Benchmark, Version %s\n", Version);
  Done = false;
  while (!Done) {
    printf ("Trying %d runs through Dhrystone.\n", Number_Of_Runs);
    /***************/
    /* Start timer */
    /***************/
    Start_Timer();
    for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index)
    {
      Proc_5();
      Proc_4();
 	/* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
      Int_1_Loc = 2;
      Int_2_Loc = 3;
      strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
      Enum_Loc = Ident_2;
      Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc);
 	/* Bool_Glob == 1 */
      while (Int_1_Loc < Int_2_Loc)  /* loop body executed once */
      {
 	Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
 	  /* Int_3_Loc == 7 */
 	Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc);
 	  /* Int_3_Loc == 7 */
 	Int_1_Loc += 1;
      } /* while */
 	/* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
      Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
 	/* Int_Glob == 5 */
      Proc_1 (Ptr_Glob);
      for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
 			       /* loop body executed twice */
      {
 	if (Enum_Loc == Func_1 (Ch_Index, 'C'))
 	    /* then, not executed */
 	  {
 	  Proc_6 (Ident_1, &Enum_Loc);
 	  strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
 	  Int_2_Loc = Run_Index;
 	  Int_Glob = Run_Index;
 	  }
      }
 	/* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
      Int_2_Loc = Int_2_Loc * Int_1_Loc;
      Int_1_Loc = Int_2_Loc / Int_3_Loc;
      Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
 	/* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
      Proc_2 (&Int_1_Loc);
 	/* Int_1_Loc == 5 */
    } /* loop "for Run_Index" */
    /**************/
    /* Stop timer */
    /**************/
    Stop_Timer();
    User_Time = End_Time - Begin_Time;
    Done = true;
  }
  if (!check(Int_Glob == 5)) {
    printf("Int_Glob:            %d\n", Int_Glob);
    printf("        should be:   %d\n", 5);
  }
  if (!check(Bool_Glob == 1)) {
    printf("Bool_Glob:           %d\n", Bool_Glob);
    printf("        should be:   %d\n", 1);
  }
  if (!check(Ch_1_Glob == 'A')) {
    printf("Ch_1_Glob:           %c\n", Ch_1_Glob);
    printf("        should be:   %c\n", 'A');
  }
  if (!check(Ch_2_Glob == 'B')) {
    printf("Ch_2_Glob:           %c\n", Ch_2_Glob);
    printf("        should be:   %c\n", 'B');
  }
  if (!check(Arr_1_Glob[8] == 7)) {
    printf("Arr_1_Glob[8]:       %d\n", Arr_1_Glob[8]);
    printf("        should be:   %d\n", 7);
  }
  if (!check(Arr_2_Glob[8][7] == Number_Of_Runs + 10)) {
    printf("Arr_2_Glob[8][7]:    %d\n", Arr_2_Glob[8][7]);
    printf("        should be:   Number_Of_Runs + 10\n");
  }
  if (!check((int)Ptr_Glob->Discr == 0)) {
    printf("Ptr_Glob->Discr:             %d\n", Ptr_Glob->Discr);
    printf("        should be:   %d\n", 0);
  }
  if (!check(Ptr_Glob->variant.var_1.Enum_Comp == 2)) {
    printf("Ptr_Glob->Enum_Comp:         %d\n", Ptr_Glob->variant.var_1.Enum_Comp);
    printf("        should be:   %d\n", 2);
  }
  if (!check(Ptr_Glob->variant.var_1.Int_Comp == 17)) {
    printf("Ptr_Glob->Int_Comp:          %d\n", Ptr_Glob->variant.var_1.Int_Comp);
    printf("        should be:   %d\n", 17);
  }
  if (!check(strcmp(Ptr_Glob->variant.var_1.Str_Comp, "DHRYSTONE PROGRAM, SOME STRING") == 0)) {
    printf("Ptr_Glob->Str_Comp:          %s\n", Ptr_Glob->variant.var_1.Str_Comp);
    printf("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");
  }
  if (!check((int)Next_Ptr_Glob->Discr == 0)) {
    printf("Next_Ptr_Glob->Discr:             %d\n", Next_Ptr_Glob->Discr);
    printf("        should be:   %d\n", 0);
  }
  if (!check(Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)) {
    printf("Next_Ptr_Glob->Enum_Comp:         %d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
    printf("        should be:   %d\n", 1);
  }
  if (!check(Next_Ptr_Glob->variant.var_1.Int_Comp == 18)) {
    printf("Next_Ptr_Glob->Int_Comp:          %d\n", Next_Ptr_Glob->variant.var_1.Int_Comp);
    printf("        should be:   %d\n", 18);
  }
  if (!check(strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp, "DHRYSTONE PROGRAM, SOME STRING") == 0)) {
    printf("Next_Ptr_Glob->Str_Comp:          %s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
    printf("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");
  }
  if (!check(Int_1_Loc == 5)) {
    printf("Int_1_Loc:           %d\n", Int_1_Loc);
    printf("        should be:   %d\n", 5);
  }
  if (!check(Int_2_Loc == 13)) {
    printf("Int_2_Loc:           %d\n", Int_2_Loc);
    printf("        should be:   %d\n", 13);
  }
  if (!check(Int_3_Loc == 7)) {
    printf("Int_3_Loc:           %d\n", Int_3_Loc);
    printf("        should be:   %d\n", 7);
  }
  if (!check(Enum_Loc == 1)) {
    printf("Enum_Loc:            %d\n", Enum_Loc);
    printf("        should be:   %d\n", 1);
  }
  if (!check(strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)) {
    printf("Str_1_Loc:           %s\n", Str_1_Loc);
    printf("        should be:   DHRYSTONE PROGRAM, 1'ST STRING\n");
  }
  if (!check(strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)) {
    printf("Str_2_Loc:           %s\n", Str_2_Loc);
    printf("        should be:   DHRYSTONE PROGRAM, 2'ND STRING\n");
  }
  printf ("Finished in %d ms\n", (int)User_Time);
  printf("==================================================\n");
  printf("Dhrystone %s         %d Marks\n", pass ? "PASS" : "FAIL",
      880900 / (int)User_Time * NUMBER_OF_RUNS/ 500000);
  printf("                   vs. 100000 Marks (i7-7700K @ 4.20GHz)\n");
  return 0;
 }
--- a/microbench/Makefile
+++ b/microbench/Makefile
@ -0,0 +1,3 @@
 NAME = microbench
 SRCS = $(shell find -L ./src/ -name "*.c" -o -name "*.cc")
 include $(AM_HOME)/Makefile
--- a/microbench/include/benchmark.h
+++ b/microbench/include/benchmark.h
@ -0,0 +1,113 @@
 #ifndef __BENCHMARK_H__
 #define __BENCHMARK_H__
 #include <am.h>
 #include <klib.h>
 #include <klib-macros.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Postfix size units: written after a number ("32 KB", "2 MB") they expand
 // to a multiplication, e.g. "32 KB" becomes "32 * 1024".
 #define MB * 1024 * 1024
 #define KB * 1024
 // Name of the reference machine and the score printed next to the final result.
 #define REF_CPU    "i7-7700K @ 4.20GHz"
 #define REF_SCORE  100000
 // Number of times the driver runs each benchmark.
 #define REPEAT  1
 // Per-benchmark settings, three per benchmark (_S, _M, _L). Each initializer
 // fills a Setting in field order: size, mlim (heap limit in bytes),
 // ref (reference time used by the score computation), expected checksum.
 //                  size |  heap | time |  checksum
 #define QSORT_S {     100,   1 KB,     0, 0x08467105}
 #define QSORT_M {   30000, 128 KB,     0, 0xa3e99fe4}
 #define QSORT_L {  100000, 640 KB,  5114, 0xed8cff89}
 #define QUEEN_S {       8,   0 KB,     0, 0x0000005c}
 #define QUEEN_M {      11,   0 KB,     0, 0x00000a78}
 #define QUEEN_L {      12,   0 KB,  4707, 0x00003778}
 #define    BF_S {       4,  32 KB,     0, 0xa6f0079e}
 #define    BF_M {      25,  32 KB,     0, 0xa88f8a65}
 #define    BF_L {     180,  32 KB, 23673, 0x9221e2b3}
 #define   FIB_S {       2,   1 KB,     0, 0x7cfeddf0}
 #define   FIB_M {      23,  16 KB,     0, 0x94ad8800}
 #define   FIB_L {      91, 256 KB, 28318, 0xebdc5f80}
 #define SIEVE_S {     100,   1 KB,     0, 0x00000019}
 #define SIEVE_M {  200000,  32 KB,     0, 0x00004640}
 #define SIEVE_L {10000000,   2 MB, 39361, 0x000a2403}
 #define  PZ15_S {       0,   1 KB,     0, 0x00000006}
 #define  PZ15_M {       1, 256 KB,     0, 0x0000b0df}
 #define  PZ15_L {       2,   2 MB,  4486, 0x00068b8c}
 #define DINIC_S {      10,   8 KB,     0, 0x0000019c}
 #define DINIC_M {      80, 512 KB,     0, 0x00004f99}
 #define DINIC_L {     128,   1 MB, 10882, 0x0000c248}
 #define  LZIP_S {     128, 128 KB,     0, 0xe05fc832}
 #define  LZIP_M {   50000,   1 MB,     0, 0xdc93e90c}
 #define  LZIP_L { 1048576,   4 MB,  7593, 0x8d62c81f}
 #define SSORT_S {     100,   4 KB,     0, 0x4c555e09}
 #define SSORT_M {   10000, 512 KB,     0, 0x0db7909b}
 #define SSORT_L {  100000,   4 MB,  4504, 0x4f0ab431}
 #define   MD5_S {     100,   1 KB,     0, 0xf902f28f}
 #define   MD5_M {  200000, 256 KB,     0, 0xd4f9bc6d}
 #define   MD5_L {10000000,  16 MB, 17239, 0x27286a42}
 // X-macro listing every benchmark. `def` is invoked once per benchmark as
 //   def(identifier, "short name", small, medium, large, "description")
 // The last entry carries no line continuation, so whatever follows the list
 // can never be spliced into the macro body by accident.
 #define BENCHMARK_LIST(def) \
  def(qsort, "qsort", QSORT_S, QSORT_M, QSORT_L, "Quick sort") \
  def(queen, "queen", QUEEN_S, QUEEN_M, QUEEN_L, "Queen placement") \
  def(   bf,    "bf",    BF_S,    BF_M,    BF_L, "Brainf**k interpreter") \
  def(  fib,   "fib",   FIB_S,   FIB_M,   FIB_L, "Fibonacci number") \
  def(sieve, "sieve", SIEVE_S, SIEVE_M, SIEVE_L, "Eratosthenes sieve") \
  def( 15pz,  "15pz",  PZ15_S,  PZ15_M,  PZ15_L, "A* 15-puzzle search") \
  def(dinic, "dinic", DINIC_S, DINIC_M, DINIC_L, "Dinic's maxflow algorithm") \
  def( lzip,  "lzip",  LZIP_S,  LZIP_M,  LZIP_L, "Lzip compression") \
  def(ssort, "ssort", SSORT_S, SSORT_M, SSORT_L, "Suffix sort") \
  def(  md5,   "md5",   MD5_S,   MD5_M,   MD5_L, "MD5 digest")
 // Each benchmark will run REPEAT times
 // Declares the three entry points (prepare / run / validate) of one
 // benchmark; expanded once per BENCHMARK_LIST entry just below.
 #define DECL(_name, _sname, _s, _m, _l, _desc) \
  void bench_##_name##_prepare(); \
  void bench_##_name##_run(); \
  int bench_##_name##_validate();
 BENCHMARK_LIST(DECL)
 // One input configuration of a benchmark.
 typedef struct Setting {
  int size;                 // benchmark-specific size parameter
  unsigned long mlim, ref;  // mlim: heap limit in bytes; ref: reference time used for scoring
  uint32_t checksum;        // expected result, compared by the validate function
 } Setting;
 // Descriptor of one benchmark: its entry points, names and three settings.
 typedef struct Benchmark {
  void (*prepare)();        // set-up hook
  void (*run)();            // the workload itself
  int (*validate)();        // non-zero when the result matches the expected checksum
  const char *name, *desc;  // short name and human-readable description
  Setting settings[3];      // initialised from the _S, _M and _L setting macros, in that order
 } Benchmark;
 // Benchmark and setting currently being executed; benchmarks read their
 // parameters through `setting`.
 extern Benchmark *current;
 extern Setting *setting;
 // Outcome of a single benchmark run.
 typedef struct Result {
  int pass;                 // value returned by the benchmark's validate function
  unsigned long tsc, msec;  // msec: elapsed milliseconds; tsc: presumably a cycle count -- TODO confirm
 } Result;
 // Timing hooks operating on a Result.
 void prepare(Result *res);
 void done(Result *res);
 // memory allocation
 // bench_alloc returns a block from the benchmark heap; bench_free takes a
 // pointer previously obtained from it.
 void* bench_alloc(size_t size);
 void bench_free(void *ptr);
 // random number generator
 // bench_srand reseeds the generator that bench_rand draws from.
 void bench_srand(uint32_t seed);
 uint32_t bench_rand(); // return a random number between 0..32767
 // checksum
 // Hashes the byte range [start, end) into a 32-bit value.
 uint32_t checksum(void *start, void *end);
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/microbench/src/15pz/15pz.cc
+++ b/microbench/src/15pz/15pz.cc
@ -0,0 +1,88 @@
 #include <benchmark.h>
 #include "puzzle.h"
 #include "heap.h"
 // Board side length: a 4x4 board, i.e. the 15-puzzle.
 const int N = 4;
 // Start boards in row-major order, 0 marking the blank. Which board is used
 // depends on setting->size (0, 1, 2 -> PUZZLE_S, PUZZLE_M, PUZZLE_L).
 static int PUZZLE_S[N*N] = {
  1, 2, 3, 4,
  5, 6, 7, 8,
  9, 10, 0, 11,
  13, 14, 15, 12,
 };
 static int PUZZLE_M[N*N] = {
  1, 2, 3, 4,
  5, 6, 7, 8,
  12, 0, 14, 13,
  11, 15, 10, 9,
 };
 static int PUZZLE_L[N*N] = {
  0, 2, 3, 4,
  9, 6, 7, 8,
  5, 11, 10, 12,
  1, 15, 13, 14,
 };
 // Result of the last run; compared against the expected checksum.
 static int ans;
 extern "C" {
 // Nothing to set up ahead of time: bench_15pz_run() builds the puzzle and
 // the heap itself.
 void bench_15pz_prepare() {
 }
 // Searches for the solved board starting from the board selected by
 // setting->size. The lowest-weight state is popped repeatedly and its
 // reachable neighbours are pushed with a path length one larger. The loop
 // stops when the solved board is popped, the heap runs empty, or MAXN states
 // have been popped. On success `ans` is (path length of the solution) *
 // (number of pops); otherwise it stays -1.
 void bench_15pz_run() {
  N_puzzle<N> puzzle;
  // Initialised so MAXN is never read indeterminate should the default branch
  // be reached with assertions compiled out.
  int MAXN = 0;
  switch (setting->size) {
    case 0: puzzle = N_puzzle<N>(PUZZLE_S); MAXN = 10; break;
    case 1: puzzle = N_puzzle<N>(PUZZLE_M); MAXN = 2048; break;
    case 2: puzzle = N_puzzle<N>(PUZZLE_L); MAXN = 16384; break;
    default: assert(0);
  }
  assert(puzzle.solvable());
  // The heap object lives in benchmark memory; init() plays the constructor's role.
  auto *heap = (Updatable_heap<N_puzzle<N>> *) bench_alloc(sizeof(Updatable_heap<N_puzzle<N>>));
  heap->init(MAXN);
  heap->push( puzzle, 0 );
  int n = 0;  // number of states popped so far
  ans = -1;
  while( heap->size() != 0 && n != MAXN ) {
    N_puzzle<N> top = heap->pop();
    ++n;
    if ( top == N_puzzle<N>::solution() ) {
      // We are done
      ans = heap->length(top) * n;
      return;
    }
    if ( top.tile_left_possible() ) {
      heap->push( top.tile_left(), heap->length( top ) + 1 );
    }
    if ( top.tile_right_possible() ) {
      heap->push( top.tile_right(), heap->length( top ) + 1 );
    }
    if ( top.tile_up_possible() ) {
      heap->push( top.tile_up(), heap->length( top ) + 1 );
    }
    if ( top.tile_down_possible() ) {
      heap->push( top.tile_down(), heap->length( top ) + 1 );
    }
  }
 }
 // Returns non-zero when the recorded answer equals the expected checksum of
 // the active setting.
 int bench_15pz_validate() {
  const uint32_t got = (uint32_t)ans;
  return got == setting->checksum;
 }
 }
--- a/microbench/src/15pz/heap.h
+++ b/microbench/src/15pz/heap.h
@ -0,0 +1,227 @@
 // Author:  Douglas Wilhelm Harder
 // Copyright (c) 2009 by Douglas Wilhelm Harder.  All rights reserved.
 // Returns a when a > b, otherwise b.
 template <typename T>
 T max(T a, T b) {
  if ( a > b ) {
    return a;
  }
  return b;
 }
 // Min-heap of search states ordered by weight, combined with a chained hash
 // table so that an element already stored can be found and re-weighted.
 // Storage comes from bench_alloc(); call init() before use.
 template <typename T>
 class Updatable_heap {
  private:
    int M;                    // capacity passed to init(); also used to mask hash values
    class Step;
    Step **hash_table;        // bucket heads, chained through Step::next
    Step **heap;              // 1-based binary heap (root at heap[1])
    int heap_size;            // number of entries currently in the heap
    int maximum_heap_size;    // largest heap_size observed
    void inline swap( int, int );
    void percolate_down();
    void percolate_up( int );
    Step *pointer( T const & ) const;
  public:
    void init(int m);
    ~Updatable_heap();
    T pop();
    void push( T const &, int );
    int size() const;
    int maximum_size() const;
    int length( T const & ) const;
 };
 // Bookkeeping record for one stored element: it is at once a node of a hash
 // bucket chain and an entry of the heap array.
 template <typename T>
 class Updatable_heap<T>::Step {
  public:
    T element;             // the stored state
    Step *next;            // next record in the same hash bucket
    int heap_index;        // position in the heap array
    int path_length;       // path length recorded when the element was first pushed
    int path_weight;       // path length + element.lower_bound(); heap ordering key
    bool visited;          // set to false by init(); NOTE(review): never set to true in this file
    Step *previous_step;   // set to 0 by init(); NOTE(review): not otherwise used in this file
    void init( T const &, Step *, int, int );
    int length() const;
    int weight() const;
 };
 // Prepares an empty heap with capacity m. Both arrays are taken from
 // bench_alloc().
 template <typename T>
 void Updatable_heap<T>::init(int m) {
  M = m;
  // The heap array is 1-based (the root lives in heap[1]) and push() checks
  // heap_size <= M *before* incrementing it, so index M + 1 can be written.
  // Reserve M + 2 slots so that write stays inside the allocation.
  heap = (Step **)bench_alloc(sizeof(void *) * (M + 2));
  hash_table = (Step **)bench_alloc(sizeof(void *) * (M + 1));
  heap_size = 0;
  maximum_heap_size = 0;
  for ( int i = 0; i < M; ++i ) {
    hash_table[i] = 0;
  }
 }
 // Destructor. Steps and both arrays are obtained from bench_alloc(), and
 // nothing in this class ever releases them individually, so there is nothing
 // to do here.
 template <typename T>
 Updatable_heap<T>::~Updatable_heap() {
  // Intentionally empty: the former bucket walk only advanced a pointer (with
  // an unused temporary) and released nothing.
 }
 // Removes and returns the element at the root of the heap. An empty heap
 // yields a default-constructed T. The removed element's Step stays in the
 // hash table.
 template <typename T>
 T Updatable_heap<T>::pop() {
  if ( heap_size == 0 ) {
    return T();
  }
  T top = heap[1]->element;
  if ( heap_size != 1 ) {
    assert( size() > 1 );
    // Move the last entry to the root, shrink, then restore heap order.
    Step *last = heap[heap_size];
    --heap_size;
    heap[1] = last;
    last->heap_index = 1;
    percolate_down();
  } else {
    heap_size = 0;
  }
  return top;
 }
 // Exchanges heap entries i and j and refreshes their stored indices.
 template <typename T>
 void inline Updatable_heap<T>::swap( int i, int j ) {
  Step *at_i = heap[i];
  Step *at_j = heap[j];
  heap[i] = at_j;
  heap[j] = at_i;
  at_j->heap_index = i;
  at_i->heap_index = j;
 }
 // Sinks the root entry until it weighs strictly less than both children;
 // at each step it is swapped with the lighter child (the right child on a
 // tie). A final check handles a node that has only a left child.
 template <typename T>
 void Updatable_heap<T>::percolate_down() {
  int n = 1;
  for ( ;; ) {
    const int left = 2*n;
    const int right = left + 1;
    if ( right > size() ) {
      break;
    }
    const int here = heap[n]->weight();
    if ( here < heap[left]->weight() && here < heap[right]->weight() ) {
      return;
    }
    if ( heap[left]->weight() < heap[right]->weight() ) {
      swap( n, left );
      n = left;
    } else {
      assert( heap[2*n]->weight() >= heap[2*n + 1]->weight() );
      swap( n, right );
      n = right;
    }
  }
  if ( 2*n == size() &&  heap[2*n]->weight() < heap[n]->weight() ) {
    swap( n, 2*n );
  }
 }
 // Lifts entry n towards the root for as long as its parent weighs more.
 template <typename T>
 void Updatable_heap<T>::percolate_up( int n ) {
  while ( n != 1 ) {
    const int parent = n/2;
    if ( !( heap[parent]->weight() > heap[n]->weight() ) ) {
      return;
    }
    swap( parent, n );
    n = parent;
  }
 }
 // Inserts pz with the given path length. If pz is already stored and not yet
 // marked visited, only its weight is lowered when the new path is better;
 // the stored path_length is left untouched in that case.
 template <typename T>
 void Updatable_heap<T>::push( T const &pz, int path_length ) {
  Step *known = pointer( pz );
  if ( known != 0 ) {
    if ( known->visited ) {
      return;
    }
    const int candidate = path_length + known->element.lower_bound();
    if ( candidate < known->weight() ) {
      known->path_weight = candidate;
      percolate_up( known->heap_index );
    }
    return;
  }
  // New element: append it to the heap and link it at the head of its bucket.
  assert( heap_size <= M );
  ++heap_size;
  Step *fresh = (Step*)bench_alloc(sizeof(Step));
  fresh->init( pz, hash_table[pz.hash() & (M - 1)], size(), path_length );
  hash_table[pz.hash() & (M - 1)] = fresh;
  heap[size()] = fresh;
  percolate_up( size() );
  maximum_heap_size = max( maximum_heap_size, size() );
 }
 // Number of entries currently in the heap.
 template <typename T>
 int Updatable_heap<T>::size() const {
  return heap_size;
 }
 // Largest heap size recorded by push() since init().
 template <typename T>
 int Updatable_heap<T>::maximum_size() const {
  return maximum_heap_size;
 }
 // Path length stored for pz, or 2147483647 when pz is not in the table.
 template <typename T>
 int Updatable_heap<T>::length( T const &pz ) const {
  Step *found = pointer( pz );
  if ( found == 0 ) {
    return 2147483647;
  }
  return found->length();
 }
 // Walks the hash bucket of pz and returns the Step whose element compares
 // equal to it, or 0 when there is none.
 template <typename T>
 typename Updatable_heap<T>::Step *Updatable_heap<T>::pointer( T const &pz ) const {
  Step *cursor = hash_table[pz.hash() & (M - 1)];
  while ( cursor != 0 ) {
    if ( cursor->element == pz ) {
      return cursor;
    }
    cursor = cursor->next;
  }
  return 0;
 }
 /****************************************************
 * ************************************************ *
 * *                   Iterator                   * *
 * ************************************************ *
 ****************************************************/
 // Fills in a fresh Step: stores the element, links it in front of bucket
 // chain n, records heap position hi and path length dist, and derives the
 // weight as dist plus the element's lower bound.
 template <typename T>
 void Updatable_heap<T>::Step::init( T const &pz, Step *n, int hi, int dist ) {
  this->element = pz;
  this->path_length = dist;
  this->path_weight = dist + this->element.lower_bound();
  this->heap_index = hi;
  this->next = n;
  this->previous_step = 0;
  this->visited = false;
 }
 // Path length recorded for this step.
 template <typename T>
 int Updatable_heap<T>::Step::length() const {
  return path_length;
 }
 // Ordering key of this step in the heap.
 template <typename T>
 int Updatable_heap<T>::Step::weight() const {
  return path_weight;
 }
--- a/microbench/src/15pz/puzzle.h
+++ b/microbench/src/15pz/puzzle.h
@ -0,0 +1,475 @@
 // Author:  Douglas Wilhelm Harder
 // Copyright (c) 2009 by Douglas Wilhelm Harder.  All rights reserved.
 // Url: https://ece.uwaterloo.ca/~dwharder/aads/Algorithms/N_puzzles/
 // An N x N sliding-tile board. Tile 0 is the blank. Besides the tiles the
 // object caches the blank position, the Manhattan distance to the solved
 // board and a hash of the tiles.
 template <int N>
 class N_puzzle {
  private:
    bool puzzle_valid;           // false for boards rejected by a constructor or an impossible move
    uint8_t zero_i, zero_j;      // row and column of the blank
    int8_t manhattan_distance;   // summed row/column distance of every tile to its goal cell
    int8_t puzzle[N][N];         // the tiles, indexed [row][column]
    int hash_value;              // cached by determine_hash()
    void determine_hash();
    static int abs( int n ) { return ( n < 0 ) ? -n : n; }
  public:
    N_puzzle();
    N_puzzle( int array[N*N] );
    N_puzzle( N_puzzle const & );
    N_puzzle &operator=( N_puzzle const & );
    bool solvable() const;
    bool valid() const;
    int lower_bound() const;
    unsigned int hash() const;
    bool tile_up_possible() const;
    bool tile_down_possible() const;
    bool tile_left_possible() const;
    bool tile_right_possible() const;
    N_puzzle tile_up() const;
    N_puzzle tile_down() const;
    N_puzzle tile_left() const;
    N_puzzle tile_right() const;
    bool operator==( N_puzzle const & ) const;
    bool operator!=( N_puzzle const & ) const;
    N_puzzle static solution();
 };
 // Builds a random board: every cell draws one of the not-yet-used tiles with
 // bench_rand(), while the blank position and the Manhattan distance are
 // tracked along the way.
 template < int N >
 N_puzzle<N>::N_puzzle():
 puzzle_valid( true ),
 manhattan_distance( 0 ) {
  int pool[N*N];
  for ( int v = 0; v < N*N; ++v ) {
    pool[v] = v;
  }
  int remaining = N*N;
  for ( int row = 0; row < N; ++row ) {
    for ( int col = 0; col < N; ++col ) {
      int pick = bench_rand() % remaining;
      puzzle[row][col] = pool[pick];
      if ( pool[pick] != 0 ) {
        manhattan_distance += abs( ((pool[pick] - 1) / N) - row );
        manhattan_distance += abs( ((pool[pick] - 1) % N) - col );
      } else {
        zero_i = row;
        zero_j = col;
      }
      // Drop the drawn tile by overwriting it with the last unused one.
      --remaining;
      pool[pick] = pool[remaining];
    }
  }
  determine_hash();
 }
 // Builds a board from N*N tiles given in row-major order. The board is
 // marked invalid unless the tiles are exactly the values 0 .. N*N-1, each
 // occurring once; an invalid board gets no hash.
 template < int N >
 N_puzzle<N>::N_puzzle( int array[N*N] ):
 puzzle_valid( true ),
 zero_i( 0 ),
 zero_j( 0 ),
 manhattan_distance( 0 ),
 hash_value( 0 ) {
  bool check[N*N];
  for ( int i = 0; i < N*N; ++i ) {
    check[i] = false;
  }
  int n = 0;
  for ( int i = 0; i < N; ++i ) {
    for ( int j = 0; j < N; ++j ) {
      const int tile = array[n];
      puzzle[i][j] = tile;
      // Record only in-range tiles: an out-of-range value must not index
      // check[], and it necessarily leaves some slot unmarked, so the board
      // is rejected below.
      if ( tile >= 0 && tile < N*N ) {
        check[tile] = true;
      }
      if ( tile == 0 ) {
        zero_i = i;
        zero_j = j;
      } else {
        manhattan_distance += abs( ((tile - 1) / N) - i );
        manhattan_distance += abs( ((tile - 1) % N) - j );
      }
      ++n;
    }
  }
  for ( int i = 0; i < N*N; ++i ) {
    if ( !check[i] ) {
      puzzle_valid = false;
      return;
    }
  }
  determine_hash();
 }
 /*
 * Determine a hash value for the puzzle.
 *
 * The tiles are folded in row-major order into a polynomial hash with
 * multiplier 1973. The arithmetic is done on an unsigned value so that the
 * (expected) wrap-around is well defined; the stored bit pattern is the
 * same as with wrapping signed arithmetic.
 */
 template < int N >
 void N_puzzle<N>::determine_hash() {
  unsigned int h = 0;
  for ( int i = 0; i < N; ++i ) {
    for ( int j = 0; j < N; ++j ) {
      h = h*1973u + (unsigned int)puzzle[i][j];
    }
  }
  hash_value = (int)h;
 }
 // Copy constructor: duplicates the cached fields and every tile of pz.
 template < int N >
 N_puzzle<N>::N_puzzle( N_puzzle const &pz ) {
  puzzle_valid = pz.puzzle_valid;
  zero_i = pz.zero_i;
  zero_j = pz.zero_j;
  manhattan_distance = pz.manhattan_distance;
  hash_value = pz.hash_value;
  for ( int row = 0; row < N; ++row ) {
    for ( int col = 0; col < N; ++col ) {
      puzzle[row][col] = pz.puzzle[row][col];
    }
  }
 }
 // Assignment: copies every tile and all cached fields from rhs and returns
 // *this.
 template < int N >
 N_puzzle<N> &N_puzzle<N>::operator=( N_puzzle const &rhs ) {
  for ( int row = 0; row < N; ++row ) {
    for ( int col = 0; col < N; ++col ) {
      puzzle[row][col] = rhs.puzzle[row][col];
    }
  }
  hash_value = rhs.hash_value;
  manhattan_distance = rhs.manhattan_distance;
  zero_j = rhs.zero_j;
  zero_i = rhs.zero_i;
  puzzle_valid = rhs.puzzle_valid;
  return *this;
 }
 /*
 *  A tile can move up when the board is valid and the blank
 *  is not in the last row.
 */
 template <int N>
 bool N_puzzle<N>::tile_up_possible() const {
  if ( !puzzle_valid ) {
    return false;
  }
  return zero_i != N - 1;
 }
 /*
 *  A tile can move down when the board is valid and the blank
 *  is not in the first row.
 */
 template <int N>
 bool N_puzzle<N>::tile_down_possible() const {
  if ( !puzzle_valid ) {
    return false;
  }
  return zero_i != 0;
 }
 /*
 *  A tile can move left when the board is valid and the blank
 *  is not in the last column.
 */
 template <int N>
 bool N_puzzle<N>::tile_left_possible() const {
  if ( !puzzle_valid ) {
    return false;
  }
  return zero_j != N - 1;
 }
 /*
 *  A tile can move right when the board is valid and the blank
 *  is not in the first column.
 */
 template <int N>
 bool N_puzzle<N>::tile_right_possible() const {
  if ( !puzzle_valid ) {
    return false;
  }
  return zero_j != 0;
 }
 // Returns the board obtained by sliding the tile below the blank upwards.
 // An invalid board is returned unchanged; if the blank is already in the
 // last row the result is a copy marked invalid.
 template <int N>
 N_puzzle<N> N_puzzle<N>::tile_up() const {
  if ( !puzzle_valid ) {
    return *this;
  }
  N_puzzle moved( *this );
  if ( zero_i == N - 1 ) {
    moved.puzzle_valid = false;
    return moved;
  }
  const int tile = puzzle[zero_i + 1][zero_j];
  const int goal_row = (tile - 1) / N;
  // Only the row term of the moved tile changes.
  moved.manhattan_distance +=
    abs( goal_row - zero_i ) - abs( goal_row - (zero_i + 1) );
  moved.puzzle[zero_i][zero_j] = tile;
  moved.zero_i = zero_i + 1;
  moved.puzzle[zero_i + 1][zero_j] = 0;
  moved.determine_hash();
  return moved;
 }
 // Returns the board obtained by sliding the tile above the blank downwards.
 // An invalid board is returned unchanged; if the blank is already in the
 // first row the result is a copy marked invalid.
 template <int N>
 N_puzzle<N> N_puzzle<N>::tile_down() const {
  if ( !puzzle_valid ) {
    return *this;
  }
  N_puzzle moved( *this );
  if ( zero_i == 0 ) {
    moved.puzzle_valid = false;
    return moved;
  }
  const int tile = puzzle[zero_i - 1][zero_j];
  const int goal_row = (tile - 1) / N;
  // Only the row term of the moved tile changes.
  moved.manhattan_distance +=
    abs( goal_row - zero_i ) - abs( goal_row - (zero_i - 1) );
  moved.puzzle[zero_i][zero_j] = tile;
  moved.zero_i = zero_i - 1;
  moved.puzzle[zero_i - 1][zero_j] = 0;
  moved.determine_hash();
  return moved;
 }
 // Returns the board obtained by sliding the tile right of the blank to the
 // left. An invalid board is returned unchanged; if the blank is already in
 // the last column the result is a copy marked invalid.
 template <int N>
 N_puzzle<N> N_puzzle<N>::tile_left() const {
  if ( !puzzle_valid ) {
    return *this;
  }
  N_puzzle moved( *this );
  if ( zero_j == N - 1 ) {
    moved.puzzle_valid = false;
    return moved;
  }
  const int tile = puzzle[zero_i][zero_j + 1];
  const int goal_col = (tile - 1) % N;
  // Only the column term of the moved tile changes.
  moved.manhattan_distance +=
    abs( goal_col - zero_j ) - abs( goal_col - (zero_j + 1) );
  moved.puzzle[zero_i][zero_j] = tile;
  moved.zero_j = zero_j + 1;
  moved.puzzle[zero_i][zero_j + 1] = 0;
  moved.determine_hash();
  return moved;
 }
 // Returns the board obtained by sliding the tile left of the blank to the
 // right. An invalid board is returned unchanged; if the blank is already in
 // the first column the result is a copy marked invalid.
 template <int N>
 N_puzzle<N> N_puzzle<N>::tile_right() const {
  if ( !puzzle_valid ) {
    return *this;
  }
  N_puzzle moved( *this );
  if ( zero_j == 0 ) {
    moved.puzzle_valid = false;
    return moved;
  }
  const int tile = puzzle[zero_i][zero_j - 1];
  const int goal_col = (tile - 1) % N;
  // Only the column term of the moved tile changes.
  moved.manhattan_distance +=
    abs( goal_col - zero_j ) - abs( goal_col - (zero_j - 1) );
  moved.puzzle[zero_i][zero_j] = tile;
  moved.zero_j = zero_j - 1;
  moved.puzzle[zero_i][zero_j - 1] = 0;
  moved.determine_hash();
  return moved;
 }
 /*
 *  Check if the puzzle is solvable: count the inversions of the
 *  tile sequence (the blank counted as N*N), add the Manhattan
 *  distance of the blank from the lower-right corner, and accept
 *  the board when the total is even. Invalid boards are never
 *  solvable.
 *
 *  Run time:   O(n^2)
 *  Memory:     O(n)
 */
 template <int N>
 bool N_puzzle<N>::solvable() const {
  if ( !valid() ) {
    return false;
  }
  int flat[N*N];
  for ( int row = 0; row < N; ++row ) {
    for ( int col = 0; col < N; ++col ) {
      const int tile = puzzle[row][col];
      flat[N*row + col] = ( tile == 0 ) ? N*N : tile;
    }
  }
  int parity = 0;
  for ( int lo = 0; lo < N*N; ++lo ) {
    for ( int hi = lo + 1; hi < N*N; ++hi ) {
      if ( flat[lo] > flat[hi] ) {
        ++parity;
      }
    }
  }
  parity += 2*N - 2 - zero_i - zero_j;
  const bool even = ( (parity & 1) == 0 );
  return even;
 }
 // Whether the board is marked valid.
 template <int N>
 bool N_puzzle<N>::valid() const {
  return puzzle_valid;
 }
 /*
 *  Return the lower bound used as search heuristic: the cached
 *  Manhattan distance between the puzzle and the solution, or
 *  N*N*N for an invalid puzzle.
 *
 *  The Hamming and discrete distances that used to follow the
 *  return statement could never execute and have been removed.
 */
 template <int N>
 int N_puzzle<N>::lower_bound() const {
  // The Manhattan distance
  return valid() ? manhattan_distance : N*N*N;
 }
 /*
 *  puzzle1 == puzzle2
 *
 *  Two puzzles compare equal when both are valid, their hash
 *  values agree and every tile matches. If either puzzle is
 *  invalid the result is false.
 */
 template < int N >
 bool N_puzzle<N>::operator==( N_puzzle const &rhs ) const {
  if ( !valid() ) {
    return false;
  }
  if ( !rhs.valid() ) {
    return false;
  }
  if ( hash() != rhs.hash() ) {
    return false;
  }
  for ( int cell = 0; cell < N*N; ++cell ) {
    const int row = cell / N;
    const int col = cell % N;
    if ( puzzle[row][col] != rhs.puzzle[row][col] ) {
      return false;
    }
  }
  return true;
 }
 /*
 *  puzzle1 != puzzle2
 *
 *  Two valid puzzles are unequal when their hash values differ
 *  or any tile differs. If either puzzle is invalid the result
 *  is false (so an invalid puzzle is neither == nor != to
 *  anything).
 */
 template < int N >
 bool N_puzzle<N>::operator!=( N_puzzle const &rhs ) const {
  const bool both_valid = valid() && rhs.valid();
  if ( !both_valid ) {
    return false;
  }
  if ( hash() != rhs.hash() ) {
    return true;
  }
  for ( int cell = 0; cell < N*N; ++cell ) {
    const int row = cell / N;
    const int col = cell % N;
    if ( puzzle[row][col] != rhs.puzzle[row][col] ) {
      return true;
    }
  }
  return false;
 }
 /*
 * unsigned int hash() const
 *
 *   Returns the cached hash value, or 0 for an invalid puzzle.
 */
 template < int N >
 unsigned int N_puzzle<N>::hash() const {
  if ( !valid() ) {
    return 0;
  }
  return hash_value;
 }
 /*
 * N_puzzle<N>  solution()
 *
 *   Returns the solved board: tiles 1 .. N*N-1 in row-major
 *   order with the blank in the last cell.
 *
 *       1  2  3         1   2   3   4
 *  3x3: 4  5  6   4x4:  5   6   7   8
 *       7  8            9  10  11  12
 *                      13  14  15
 */
 template <int N>
 N_puzzle<N> N_puzzle<N>::solution() {
  int goal[N*N];
  // (cell + 1) wraps to 0 exactly at the last cell, which holds the blank.
  for ( int cell = 0; cell < N*N; ++cell ) {
    goal[cell] = (cell + 1) % (N*N);
  }
  return N_puzzle<N>( goal );
 }
--- a/microbench/src/bench.c
+++ b/microbench/src/bench.c
@ -0,0 +1,181 @@
 #include <am.h>
 #include <benchmark.h>
 #include <limits.h>
 #include <klib-macros.h>
 // Benchmark and setting selected by the driver loop in main().
 Benchmark *current;
 Setting *setting;
 // Allocation cursor of bench_alloc(); rewound by bench_reset().
 static char *hbrk;
 // Uptime in milliseconds, derived from the microsecond field of AM_TIMER_UPTIME.
 static uint32_t uptime_ms() { return io_read(AM_TIMER_UPTIME).us / 1000; }
 // The benchmark list
 // ENTRY turns one BENCHMARK_LIST item into a Benchmark initializer, wiring
 // the bench_<name>_{prepare,run,validate} functions and the three settings.
 #define ENTRY(_name, _sname, _s, _m, _l, _desc) \
  { .prepare = bench_##_name##_prepare, \
    .run = bench_##_name##_run, \
    .validate = bench_##_name##_validate, \
    .name = _sname, \
    .desc = _desc, \
    .settings = {_s, _m, _l}, },
 Benchmark benchmarks[] = {
  BENCHMARK_LIST(ENTRY)
 };
 // Running a benchmark
 // Starts the timer: stores the current uptime in res->msec.
 static void bench_prepare(Result *res) {
  res->msec = uptime_ms();
 }
 // Rewinds the allocation cursor to the 8-byte-aligned start of the heap,
 // discarding everything handed out by bench_alloc() so far.
 static void bench_reset() {
  hbrk = (void *)ROUNDUP(heap.start, 8);
 }
 // Stops the timer: turns the start time in res->msec into the elapsed time.
 static void bench_done(Result *res) {
  res->msec = uptime_ms() - res->msec;
 }
 // Returns NULL when the heap is large enough for the active setting's
 // memory limit, otherwise a message explaining why the benchmark is skipped.
 // The `bench` argument is not consulted.
 static const char *bench_check(Benchmark *bench) {
  uintptr_t heap_bytes = (uintptr_t)heap.end - (uintptr_t)heap.start;
  return (heap_bytes < setting->mlim) ? "(insufficient memory)" : NULL;
 }
 // Executes the benchmark selected in `current` once and fills in *res.
 // Only the run() call is timed. The `b` argument is not consulted.
 static void run_once(Benchmark *b, Result *res) {
  bench_reset();       // reset malloc state
  current->prepare();  // call benchmark's prepare function
  bench_prepare(res);  // start timer
  current->run();      // run it
  bench_done(res);     // collect results
  res->pass = current->validate();
 }
 // Score of one run: the active setting's reference time scaled by
 // REF_SCORE / 1000 and divided by the measured milliseconds. A measured time
 // of 0 ms scores 0. `b` and `tsc` are not consulted.
 static unsigned long score(Benchmark *b, unsigned long tsc, unsigned long msec) {
  if (msec != 0) {
    unsigned long scaled = (REF_SCORE / 1000) * setting->ref;
    return scaled / msec;
  }
  return 0;
 }
 // Benchmark driver. `args` selects the input set ("test", "train" or "ref";
 // NULL or empty means "ref"). Every benchmark that fits into the heap is run
 // REPEAT times; its best time is converted to a score, and the average score
 // is reported for the "ref" input set.
 int main(const char *args) {
  const char *setting_name = args;
  if (args == NULL || strcmp(args, "") == 0) {
    printf("Empty mainargs. Use \"ref\" by default\n");
    setting_name = "ref";
  }
  // Map the setting name to an index into Benchmark.settings.
  int setting_id = -1;
  if      (strcmp(setting_name, "test" ) == 0) setting_id = 0;
  else if (strcmp(setting_name, "train") == 0) setting_id = 1;
  else if (strcmp(setting_name, "ref"  ) == 0) setting_id = 2;
  else {
    printf("Invalid mainargs: \"%s\"; "
           "must be in {test, train, ref}\n", setting_name);
    halt(1);
  }
  ioe_init();
  printf("======= Running MicroBench [input *%s*] =======\n", setting_name);
  unsigned long bench_score = 0;
  int pass = 1;
  uint32_t t0 = uptime_ms();
  for (int i = 0; i < LENGTH(benchmarks); i ++) {
    Benchmark *bench = &benchmarks[i];
    current = bench;
    setting = &bench->settings[setting_id];
    const char *msg = bench_check(bench);
    printf("[%s] %s: ", bench->name, bench->desc);
    if (msg != NULL) {
      printf("Ignored %s\n", msg);
    } else {
      // Keep the fastest of the REPEAT runs; every run must validate.
      unsigned long msec = ULONG_MAX;
      int succ = 1;
      for (int i = 0; i < REPEAT; i ++) {
        Result res;
        run_once(bench, &res);
        printf(res.pass ? "*" : "X");
        succ &= res.pass;
        if (res.msec < msec) msec = res.msec;
      }
      if (succ) printf(" Passed.");
      else printf(" Failed.");
      pass &= succ;
      unsigned long cur = score(bench, 0, msec);
      printf("\n");
      if (setting_id != 0) {
        printf("  min time: %d ms [%d]\n", (unsigned int)msec, (unsigned int)cur);
      }
      bench_score += cur;
    }
  }
  uint32_t t1 = uptime_ms();
  // Average over all listed benchmarks, including ignored ones.
  bench_score /= LENGTH(benchmarks);
  printf("==================================================\n");
  printf("MicroBench %s", pass ? "PASS" : "FAIL");
  if (setting_id == 2) {
    printf("        %d Marks\n", (unsigned int)bench_score);
    printf("                   vs. %d Marks (%s)\n", REF_SCORE, REF_CPU);
  } else {
    printf("\n");
  }
  printf("Total time: %d ms\n", t1 - t0);
  return 0;
 }
 // Libraries
 // Bump allocator on the benchmark heap: rounds the request up to a multiple
 // of 8 bytes, advances the cursor, zero-fills the new block and returns it.
 // Asserts that the cursor stays inside the heap and within the active
 // setting's memory limit.
 void* bench_alloc(size_t size) {
  char *block = hbrk;
  size  = (size_t)ROUNDUP(size, 8);
  hbrk = block + size;
  assert((uintptr_t)heap.start <= (uintptr_t)hbrk && (uintptr_t)hbrk < (uintptr_t)heap.end);
  uint64_t *word = (uint64_t *)block;
  while (word != (uint64_t *)hbrk) {
    *word = 0;
    word ++;
  }
  assert((uintptr_t)hbrk - (uintptr_t)heap.start <= setting->mlim);
  return block;
 }
 // Deliberately does nothing: memory is reclaimed all at once when
 // bench_reset() rewinds the allocation cursor.
 void bench_free(void *ptr) {
 }
 // State of the linear congruential generator behind bench_rand().
 static uint32_t seed = 1;
 // Reseeds the generator; only the low 15 bits of _seed are kept.
 void bench_srand(uint32_t _seed) {
  const uint32_t low15 = 0x7fff;
  seed = _seed & low15;
 }
 // Advances the generator (state * 214013 + 2531011, modulo 2^32) and returns
 // bits 16..30 of the new state, i.e. a value in 0..32767.
 uint32_t bench_rand() {
  const uint32_t multiplier = (uint32_t)214013L;
  const uint32_t increment = (uint32_t)2531011L;
  uint32_t next = seed * multiplier;
  next += increment;
  seed = next;
  return (next >> 16) & 0x7fff;
 }
 // FNV hash
 // Hashes the bytes of [start, end) in groups of four and scrambles the
 // result with a shift/add/xor finaliser.
 //
 // Compatibility note: a group is consumed only while MORE than four bytes
 // remain, so the last 1..4 bytes of the range never enter the hash. The
 // expected checksums of the benchmarks were produced with this rule, which is
 // therefore kept on purpose.
 //
 // Additions and left shifts are done on an unsigned value (wrap-around is
 // well defined there); right shifts go through int32_t to keep the
 // sign-propagating shift the reference values were computed with. The result
 // is bit-identical to the previous all-signed computation on two's complement
 // targets, without relying on signed overflow.
 uint32_t checksum(void *start, void *end) {
  const uint32_t x = 16777619;
  uint32_t h1 = 2166136261u;
  // Compare the remaining length instead of forming p + 4, which could point
  // past the end of the buffer.
  for (uint8_t *p = (uint8_t*)start; (uint8_t*)end - p > 4; p += 4) {
    for (int i = 0; i < 4; i ++) {
      h1 = (h1 ^ p[i]) * x;
    }
  }
  uint32_t hash = h1;
  hash += hash << 13;
  hash ^= (uint32_t)((int32_t)hash >> 7);
  hash += hash << 3;
  hash ^= (uint32_t)((int32_t)hash >> 17);
  hash += hash << 5;
  return hash;
 }
--- a/microbench/src/bf/bf.c
+++ b/microbench/src/bf/bf.c
@ -0,0 +1,151 @@
 /*
 Brainfuck-C ( http://github.com/kgabis/brainfuck-c )
 Copyright (c) 2012 Krzysztof Gabis
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 #include <benchmark.h>
 // Number of input characters; taken from setting->size.
 static int ARR_SIZE;
 // The Brainfuck program that gets compiled and executed.
 #define CODE            ">>+>>>>>,[>+>>,]>+[--[+<<<-]<[<+>-]<[<[->[<<<+>>>>+<-]<<[>>+>[->]<<[<]" \
                        "<-]>]>>>+<[[-]<[>+<-]<]>[[>>>]+<<<-<[<<[<<<]>>+>[>>>]<-]<<[<<<]>[>>[>>" \
                        ">]<+<<[<<<]>-]]+<<<]+[->>>]>>]>>[.>>>]"
 // Opcodes of the compiled program, one per Brainfuck command plus OP_END.
 #define OP_END          0
 #define OP_INC_DP       1
 #define OP_DEC_DP       2
 #define OP_INC_VAL      3
 #define OP_DEC_VAL      4
 #define OP_OUT          5
 #define OP_IN           6
 #define OP_JMP_FWD      7
 #define OP_JMP_BCK      8
 // Return codes of compile_bf().
 #define SUCCESS         0
 #define FAILURE         1
 // Capacities (in elements) of the program, bracket stack and data tape.
 #define PROGRAM_SIZE    4096
 #define STACK_SIZE      512
 #define DATA_SIZE       4096
 // Bracket stack used while compiling; holds positions of unmatched '['.
 #define STACK_PUSH(A)   (STACK[SP++] = A)
 #define STACK_POP()     (STACK[--SP])
 #define STACK_EMPTY()   (SP == 0)
 #define STACK_FULL()    (SP == STACK_SIZE)
 // One compiled instruction; for jumps `operand` is the index of the matching bracket.
 struct instruction_t {
  unsigned short operator;
  unsigned short operand;
 };
 static struct instruction_t *PROGRAM;
 static unsigned short *STACK;
 static unsigned int SP;
 // Read cursors: `code` walks the source text, `input` the program's input.
 static const char *code;
 static char *input;
 static int compile_bf() {
  unsigned short pc = 0, jmp_pc;
  for (; *code; code ++) {
    int c = *code;
    if (pc >= PROGRAM_SIZE) break;
    switch (c) {
      case '>': PROGRAM[pc].operator = OP_INC_DP; break;
      case '<': PROGRAM[pc].operator = OP_DEC_DP; break;
      case '+': PROGRAM[pc].operator = OP_INC_VAL; break;
      case '-': PROGRAM[pc].operator = OP_DEC_VAL; break;
      case '.': PROGRAM[pc].operator = OP_OUT; break;
      case ',': PROGRAM[pc].operator = OP_IN; break;
      case '[':
        PROGRAM[pc].operator = OP_JMP_FWD;
        if (STACK_FULL()) {
          return FAILURE;
        }
        STACK_PUSH(pc);
        break;
      case ']':
        if (STACK_EMPTY()) {
          return FAILURE;
        }
        jmp_pc = STACK_POP();
        PROGRAM[pc].operator = OP_JMP_BCK;
        PROGRAM[pc].operand = jmp_pc;
        PROGRAM[jmp_pc].operand = pc;
        break;
      default: pc--; break;
    }
    pc++;
  }
  if (!STACK_EMPTY() || pc == PROGRAM_SIZE) {
    return FAILURE;
  }
  PROGRAM[pc].operator = OP_END;
  return SUCCESS;
 }
 // Data tape of the interpreter.
 static unsigned short *data;
 // Bytes produced by OP_OUT and how many of them have been written.
 static char *output;
 static int noutput;
 static void execute_bf() {
  unsigned int pc = 0, ptr = 0;
  while (PROGRAM[pc].operator != OP_END && ptr < DATA_SIZE) {
    switch (PROGRAM[pc].operator) {
      case OP_INC_DP: ptr++; break;
      case OP_DEC_DP: ptr--; break;
      case OP_INC_VAL: data[ptr]++; break;
      case OP_DEC_VAL: data[ptr]--; break;
      case OP_OUT: output[noutput ++] = data[ptr]; break;
      case OP_IN: data[ptr] = *(input ++); break;
      case OP_JMP_FWD: if(!data[ptr]) { pc = PROGRAM[pc].operand; } break;
      case OP_JMP_BCK: if(data[ptr]) { pc = PROGRAM[pc].operand; } break;
      default: return;
    }
    pc++;
  }
 }
 // Allocates the interpreter state and fills the input buffer with ARR_SIZE
 // pseudo-random alphanumeric characters (generator seeded with 1).
 void bench_bf_prepare() {
  static const char alphabet[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  ARR_SIZE = setting->size;
  PROGRAM = bench_alloc(sizeof(PROGRAM[0]) * PROGRAM_SIZE);
  STACK = bench_alloc(sizeof(STACK[0]) * STACK_SIZE);
  data = bench_alloc(sizeof(data[0]) * DATA_SIZE);
  input = bench_alloc(ARR_SIZE + 1);
  output = bench_alloc(DATA_SIZE);
  SP = 0;
  code = CODE;
  noutput = 0;
  bench_srand(1);
  int filled = 0;
  while (filled < ARR_SIZE) {
    input[filled] = alphabet[bench_rand() % 62];
    filled ++;
  }
 }
 void bench_bf_run() {
  compile_bf();
  execute_bf();
 }
 // Passes when exactly ARR_SIZE bytes were output and their checksum matches
 // the expected value of the active setting.
 int bench_bf_validate() {
  if (noutput != ARR_SIZE) {
    return 0;
  }
  return checksum(output, output + noutput) == setting->checksum;
 }
--- a/microbench/src/dinic/dinic.cc
+++ b/microbench/src/dinic/dinic.cc
@ -0,0 +1,138 @@
 #include <benchmark.h>
 // Problem size; taken from setting->size.
 static int N;
 // Flow amount requested from the source in each augmentation round.
 const int INF = 0x3f3f3f;
 // Directed edge from `from` to `to` with capacity `cap` and current `flow`.
 // The default constructor leaves all fields uninitialised.
 struct Edge {
  int from, to, cap, flow;
  Edge(){}
  Edge(int from, int to, int cap, int flow)
    : from(from), to(to), cap(cap), flow(flow) {}
 };
 // Returns x when x < y, otherwise y.
 template<typename T>
 static inline T min(T x, T y) {
  if (x < y) {
    return x;
  }
  return y;
 }
 // Maximum-flow solver. Edges are kept in `edges`, with per-vertex adjacency
 // lists threaded through `head` / `nxt`. AddEdge stores each edge directly
 // followed by its reverse edge, so edge i and edge i^1 are partners.
 struct Dinic {
  int n, m, s, t;   // vertex count, edge count, source, sink
  Edge *edges;
  int *head, *nxt, *d, *cur, *queue;  // adjacency heads/links, BFS level, DFS start edge, BFS queue
  bool *vis;        // visited marks of the last BFS
  // Allocates all arrays for n vertices. The edge capacity assumes the graph
  // built by bench_dinic_prepare(): nold*nold + 2*nold edges, each stored twice.
  void init(int n) {
    int nold = (n - 2) / 2;
    int maxm = (nold * nold + nold * 2) * 2;
    edges = (Edge *)bench_alloc(sizeof(Edge) * maxm);
    head = (int *)bench_alloc(sizeof(int) * n);
    nxt = (int *)bench_alloc(sizeof(int) * maxm);
    vis = (bool *)bench_alloc(sizeof(bool) * n);
    d = (int *)bench_alloc(sizeof(int) * n);
    cur = (int *)bench_alloc(sizeof(int) * n);
    queue = (int *)bench_alloc(sizeof(int) * n);
    this->n = n;
    for (int i = 0; i < n; i ++) {
      head[i] = -1;
    }
    m = 0;
  }
  // Adds edge u -> v with capacity c plus its zero-capacity reverse edge.
  // Edges of capacity 0 are not stored at all.
  void AddEdge(int u, int v, int c) {
    if (c == 0) return;
    edges[m] = Edge(u, v, c, 0);
    nxt[m] = head[u];
    head[u] = m++;
    edges[m] = Edge(v, u, 0, 0);
    nxt[m] = head[v];
    head[v] = m++;
  }
  // Breadth-first search from s over edges with residual capacity; assigns
  // levels in d[] and returns whether t was reached.
  bool BFS() {
    for (int i = 0; i < n; i ++) vis[i] = 0;
    int qf = 0, qr = 0;
    queue[qr ++] = s;
    d[s] = 0;
    vis[s] = 1;
    while (qf != qr) {
      int x = queue[qf ++];
      for (int i = head[x]; i != -1; i = nxt[i]) {
        Edge& e = edges[i];
        if (!vis[e.to] && e.cap > e.flow) {
          vis[e.to] = 1;
          d[e.to] = d[x] + 1;
          queue[qr ++] = e.to;
        }
      }
    }
    return vis[t];
  }
  // Pushes up to `a` units of flow from x towards t along edges that go one
  // BFS level deeper; returns the amount actually pushed.
  // NOTE(review): the scan starts at cur[x], but cur[x] is not advanced here,
  // so every call rescans the list from the value set in Maxflow().
  int DFS(int x, int a) {
    if (x == t || a == 0) return a;
    int flow = 0, f;
    for (int i = cur[x]; i != -1; i = nxt[i]) {
      Edge& e = edges[i];
      if (d[x] + 1 == d[e.to] && (f = DFS(e.to, min(a, e.cap-e.flow))) > 0) {
        e.flow += f;
        edges[i^1].flow -= f;   // partner (reverse) edge
        flow += f;
        a -= f;
        if (a == 0) break;
      }
    }
    return flow;
  }
  // Computes the maximum flow from s to t: while BFS finds t, reset cur[] to
  // the list heads and add the flow found by one DFS from s.
  int Maxflow(int s, int t) {
    this -> s = s; this -> t = t;
    int flow = 0;
    while (BFS()) {
      for (int i = 0; i < n; i++)
        cur[i] = head[i];
      flow += DFS(s, INF);
    }
    return flow;
  }
 };
 extern "C" {
 // Graph built by bench_dinic_prepare() and the flow computed by bench_dinic_run().
 static Dinic *G;
 static int ans;
 // Builds the flow network: N left vertices (0..N-1) and N right vertices
 // (N..2N-1) joined by random-capacity edges, with source 2N feeding the left
 // side and the right side draining into sink 2N+1. The generator is seeded
 // with 1.
 void bench_dinic_prepare() {
  N = setting->size;
  bench_srand(1);
  const int source = 2 * N;
  const int sink = source + 1;
  G = (Dinic*)bench_alloc(sizeof(Dinic));
  G->init(sink + 1);
  for (int left = 0; left < N; left ++) {
    for (int right = 0; right < N; right ++) {
      const int cap = bench_rand() % 10;
      G->AddEdge(left, N + right, cap);
    }
  }
  for (int v = 0; v < N; v ++) {
    const int in_cap = bench_rand() % 1000;
    G->AddEdge(source, v, in_cap);
    const int out_cap = bench_rand() % 1000;
    G->AddEdge(N + v, sink, out_cap);
  }
 }
 // Computes the maximum flow from vertex 2N to vertex 2N+1 and stores it in `ans`.
 void bench_dinic_run() {
  const int source = 2 * N;
  const int sink = 2 * N + 1;
  ans = G->Maxflow(source, sink);
 }
 // Passes when the computed flow equals the expected checksum of the active setting.
 int bench_dinic_validate() {
  const uint32_t got = (uint32_t)ans;
  return got == setting->checksum;
 }
 }
--- a/microbench/src/fib/fib.c
+++ b/microbench/src/fib/fib.c
@ -0,0 +1,64 @@
 #include <benchmark.h>
 // f(n) = (f(n-1) + f(n-2) + .. f(n-m)) mod 2^32
 #define N 2147483603
 static int M;
 // Store `data` at row i, column j of the row-major M x M matrix m.
 static void put(uint32_t *m, int i, int j, uint32_t data) {
  const int idx = i * M + j;
  m[idx] = data;
 }
 // Load the element at row i, column j of the row-major M x M matrix m.
 static uint32_t get(uint32_t *m, int i, int j) {
  const int idx = i * M + j;
  return m[idx];
 }
 // c = a * b for M x M matrices, with uint32_t wrap-around arithmetic.
 // Each element of c is zeroed and then accumulated in place, term by term.
 static inline void mult(uint32_t *c, uint32_t *a, uint32_t *b) {
  for (int row = 0; row < M; row++) {
    for (int col = 0; col < M; col++) {
      put(c, row, col, 0);
      for (int mid = 0; mid < M; mid++) {
        const uint32_t term = get(a, row, mid) * get(b, mid, col);
        put(c, row, col, get(c, row, col) + term);
      }
    }
  }
 }
 // Copy every element of the M x M matrix b into a.
 static inline void assign(uint32_t *a, uint32_t *b) {
  for (int row = 0; row < M; row++) {
    for (int col = 0; col < M; col++) {
      const uint32_t value = get(b, row, col);
      put(a, row, col, value);
    }
  }
 }
 static uint32_t *A, *ans, *T, *tmp;
 // Read the matrix dimension from the benchmark setting and allocate the four
 // M x M work matrices (A, T, ans, tmp) in that order.
 void bench_fib_prepare() {
  M = setting->size;
  const int bytes = sizeof(uint32_t) * M * M;
  A = bench_alloc(bytes);
  T = bench_alloc(bytes);
  ans = bench_alloc(bytes);
  tmp = bench_alloc(bytes);
 }
 // Timed section: fill A and T with the transition matrix (ones on the
 // super-diagonal and in the last row), set ans to the identity, then raise T
 // to the N-th power by repeated squaring, accumulating the product in ans.
 void bench_fib_run() {
  for (int row = 0; row < M; row++) {
    for (int col = 0; col < M; col++) {
      const uint32_t step = (row == M - 1 || col == row + 1);
      put(A, row, col, step);
      put(T, row, col, step);
      put(ans, row, col, row == col);
    }
  }
  int e = N;
  while (e > 0) {
    if (e & 1) {
      // Current exponent bit is set: fold the current power of T into ans.
      mult(tmp, ans, T);
      assign(ans, tmp);
    }
    mult(tmp, T, T);
    assign(T, tmp);
    e >>= 1;
  }
 }
 // Returns non-zero when the bottom-right element of ans equals the expected
 // checksum.
 int bench_fib_validate() {
  const uint32_t corner = get(ans, M-1, M-1);
  return corner == setting->checksum;
 }
--- a/microbench/src/lzip/lzip.c
+++ b/microbench/src/lzip/lzip.c
@ -0,0 +1,29 @@
 #include "quicklz.h"
 #include <benchmark.h>
 static int SIZE;
 static qlz_state_compress *state;
 static char *blk;
 static char *compress;
 static int len;
 // Allocate the compressor state, a SIZE-byte input block and a (SIZE + 400)-byte
 // output buffer, then fill the input with bytes 'a' + bench_rand() % 26 drawn
 // after seeding with 1.
 void bench_lzip_prepare() {
  SIZE = setting->size;
  bench_srand(1);
  state = bench_alloc(sizeof(qlz_state_compress));
  blk = bench_alloc(SIZE);
  compress = bench_alloc(SIZE + 400);
  int pos = 0;
  while (pos < SIZE) {
    blk[pos] = 'a' + bench_rand() % 26;
    pos++;
  }
 }
 // Timed section: compress blk[0..SIZE) into `compress` and record the
 // returned length in `len`.
 void bench_lzip_run() {
  const void *input = blk;
  len = qlz_compress(input, compress, SIZE, state);
 }
 // Returns non-zero when the checksum of the first `len` output bytes equals
 // the expected value.
 int bench_lzip_validate() {
  char *end = compress + len;
  return checksum(compress, end) == setting->checksum;
 }
--- a/microbench/src/lzip/quicklz.c
+++ b/microbench/src/lzip/quicklz.c
@ -0,0 +1,761 @@
 // Fast data compression library
 // Copyright (C) 2006-2011 Lasse Mikkel Reinhold
 // lar@quicklz.com
 //
 // QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
 // released into public must be open source) or under a commercial license if such
 // has been acquired (see http://www.quicklz.com/order.html). The commercial license
 // does not cover derived or ported versions created by third parties under GPL.
 // 1.5.0 final
 #include "quicklz.h"
 #if QLZ_VERSION_MAJOR != 1 || QLZ_VERSION_MINOR != 5 || QLZ_VERSION_REVISION != 0
 	#error quicklz.c and quicklz.h have different versions
 #endif
 #define MINOFFSET 2
 #define UNCONDITIONAL_MATCHLEN 6
 #define UNCOMPRESSED_END 4
 #define CWORD_LEN 4
 #if QLZ_COMPRESSION_LEVEL == 1 && defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
 	#define OFFSET_BASE source
 	#define CAST (ui32)(size_t)
 #else
 	#define OFFSET_BASE 0
 	#define CAST
 #endif
 // Report a compile-time property of this build, selected by `setting`:
 //   0 compression level, 1 sizeof(qlz_state_compress),
 //   2 sizeof(qlz_state_decompress), 3 streaming buffer size,
 //   6 whether QLZ_MEMORY_SAFE is defined (1/0),
 //   7/8/9 version major/minor/revision.
 // Any other selector yields -1.
 int qlz_get_setting(int setting)
 {
 	int value = -1;
 	switch (setting)
 	{
 		case 0:
 			value = QLZ_COMPRESSION_LEVEL;
 			break;
 		case 1:
 			value = sizeof(qlz_state_compress);
 			break;
 		case 2:
 			value = sizeof(qlz_state_decompress);
 			break;
 		case 3:
 			value = QLZ_STREAMING_BUFFER;
 			break;
 		case 6:
 #ifdef QLZ_MEMORY_SAFE
 			value = 1;
 #else
 			value = 0;
 #endif
 			break;
 		case 7:
 			value = QLZ_VERSION_MAJOR;
 			break;
 		case 8:
 			value = QLZ_VERSION_MINOR;
 			break;
 		case 9:
 			value = QLZ_VERSION_REVISION;
 			break;
 		default:
 			break;
 	}
 	return value;
 }
 #if QLZ_COMPRESSION_LEVEL == 1
 // Returns 1 when src[1..n] all equal src[0] (trivially 1 for n == 0), else 0.
 // The bytes are examined from index n downwards.
 static int same(const unsigned char *src, size_t n)
 {
 	for (size_t k = n; k != 0; k--)
 	{
 		if (src[k] != src[0])
 			return 0;
 	}
 	return 1;
 }
 #endif
 // Clear the compressor's hash table: at level 1 every slot's offset is
 // zeroed, at the other levels every slot's fill counter is zeroed.
 static void reset_table_compress(qlz_state_compress *state)
 {
 	for (int slot = 0; slot < QLZ_HASH_VALUES; slot++)
 	{
 #if QLZ_COMPRESSION_LEVEL == 1
 		state->hash[slot].offset = 0;
 #else
 		state->hash_counter[slot] = 0;
 #endif
 	}
 }
 // Clear the decompressor's per-slot fill counters. Only level 2 has anything
 // to reset; at the other levels this is a no-op.
 static void reset_table_decompress(qlz_state_decompress *state)
 {
 #if QLZ_COMPRESSION_LEVEL == 2
 	for (int slot = 0; slot < QLZ_HASH_VALUES; slot++)
 		state->hash_counter[slot] = 0;
 #else
 	(void)state;
 #endif
 }
 // Fold a fetched value into a hash-table index in [0, QLZ_HASH_VALUES).
 // Level 2 mixes in two shifted copies of the input, the other levels one.
 static __inline ui32 hash_func(ui32 i)
 {
 #if QLZ_COMPRESSION_LEVEL == 2
 	const ui32 mixed = i ^ (i >> 9) ^ (i >> 13);
 #else
 	const ui32 mixed = i ^ (i >> 12);
 #endif
 	return mixed & (QLZ_HASH_VALUES - 1);
 }
 // Assemble `bytes` (1..4) consecutive bytes starting at src into a 32-bit
 // value, least-significant byte first, using byte accesses only so that src
 // may be unaligned. Any other `bytes` value yields 0 without touching src.
 static __inline ui32 fast_read(void const *src, ui32 bytes)
 {
 	const unsigned char *p = (const unsigned char *)src;
 	ui32 ret = 0;
 	if (bytes >= 1 && bytes <= 4) {
 		for (ui32 i = 0; i < bytes; i++) {
 			// Widen before shifting: a promoted `int` byte >= 0x80 shifted
 			// left by 24 would overflow a signed int (undefined behaviour).
 			ret |= (ui32)p[i] << (i * 8);
 		}
 	}
 	return ret;
 }
 // Hash-table index for the three bytes starting at src.
 static __inline ui32 hashat(const unsigned char *src)
 {
 	const ui32 three_bytes = fast_read(src, 3);
 	return hash_func(three_bytes);
 }
 // Store the low `bytes` bytes of f at dst, least-significant byte first,
 // using byte accesses only so that dst may be unaligned. This is the exact
 // inverse of fast_read() on every host byte order; bytes beyond the fourth
 // (f only has four) are written as zero instead of reading past f.
 static __inline void fast_write(ui32 f, void *dst, size_t bytes)
 {
 	unsigned char *out = (unsigned char *)dst;
 	for (size_t i = 0; i != bytes; i++) {
 		out[i] = (i < sizeof f) ? (unsigned char)(f >> (i * 8)) : 0;
 	}
 }
 // Read the decompressed-size field from a packet header. Bit 1 of the first
 // byte selects 4-byte size fields (set) or 1-byte fields (clear); the
 // decompressed size is the second field, located after the flag byte and the
 // first field.
 size_t qlz_size_decompressed(const char *source)
 {
 	const ui32 width = ((*source) & 2) ? 4 : 1;
 	const ui32 raw = fast_read(source + 1 + width, width);
 	const ui32 field_mask = 0xffffffff >> ((4 - width) * 8);
 	return raw & field_mask;
 }
 // Read the compressed-size field from a packet header. Bit 1 of the first
 // byte selects 4-byte size fields (set) or 1-byte fields (clear); the
 // compressed size is the first field, directly after the flag byte.
 size_t qlz_size_compressed(const char *source)
 {
 	const ui32 width = ((*source) & 2) ? 4 : 1;
 	const ui32 raw = fast_read(source + 1, width);
 	const ui32 field_mask = 0xffffffff >> ((4 - width) * 8);
 	return raw & field_mask;
 }
 // Header length in bytes: one flag byte plus two size fields, each 4 bytes
 // wide when bit 1 of the flag byte is set and 1 byte wide otherwise.
 size_t qlz_size_header(const char *source)
 {
 	if ((*source) & 2)
 		return 9;
 	return 3;
 }
 // Forward copy helper used by qlz_decompress_core(). In this port it is a
 // stub: the body is only assert(0), so any code path that reaches it (i.e.
 // decompressing a compressed packet) trips the assertion instead of copying.
 // None of the parameters are used.
 static __inline void memcpy_up(unsigned char *dst, const unsigned char *src, ui32 n)
 {
  assert(0); // unaligned memory access
 }
 // Register position s in the decompressor's hash table under the hash of the
 // three bytes at s. Level 1 keeps a single pointer per slot; level 2 keeps a
 // ring of QLZ_POINTERS pointers indexed by the slot's running counter.
 // At level 3 this is a no-op.
 static __inline void update_hash(qlz_state_decompress *state, const unsigned char *s)
 {
 #if QLZ_COMPRESSION_LEVEL == 1
 	const ui32 slot = hashat(s);
 	state->hash[slot].offset = s;
 	state->hash_counter[slot] = 1;
 #elif QLZ_COMPRESSION_LEVEL == 2
 	const ui32 slot = hashat(s);
 	const unsigned char count = state->hash_counter[slot];
 	state->hash[slot].offset[count & (QLZ_POINTERS - 1)] = s;
 	state->hash_counter[slot] = (unsigned char)(count + 1);
 #else
 	(void)state;
 	(void)s;
 #endif
 }
 #if QLZ_COMPRESSION_LEVEL <= 2
 // Advance *lh one byte at a time until it reaches max, registering each new
 // position with update_hash(). Does nothing when *lh is already >= max.
 static void update_hash_upto(qlz_state_decompress *state, unsigned char **lh, const unsigned char *max)
 {
 	while (*lh < max)
 		update_hash(state, ++(*lh));
 }
 #endif
 // Compress source[0..size) into destination (payload only, no packet header).
 //
 // Output layout: the stream is a sequence of groups, each introduced by a
 // CWORD_LEN-byte control word followed by the items it describes. cword_val
 // starts as a lone sentinel bit (1U << 31) and is shifted right once per
 // item, with the top bit set for a match and clear for a literal byte; when
 // the sentinel reaches bit 0 the word is written back at cword_ptr and a new
 // one is reserved.
 //
 // The main loop runs while src <= last_matchstart; the remaining tail bytes
 // are always emitted as literals by the second loop.
 //
 // Returns the number of bytes written (never less than 9), or 0 when the
 // ratio check inside the main loop decides the data should be stored
 // uncompressed by the caller.
 static size_t qlz_compress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_compress *state)
 {
 	const unsigned char *last_byte = source + size - 1;
 	const unsigned char *src = source;
 	unsigned char *cword_ptr = destination;
 	unsigned char *dst = destination + CWORD_LEN;
 	ui32 cword_val = 1U << 31;
 	const unsigned char *last_matchstart = last_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END;
 	ui32 fetch = 0;
 	unsigned int lits = 0;
 	(void) lits;
 	if(src <= last_matchstart)
 		fetch = fast_read(src, 3);
 	while(src <= last_matchstart)
 	{
 		// Sentinel reached bit 0: the current control word is full.
 		if ((cword_val & 1) == 1)
 		{
 			// store uncompressed if compression ratio is too low
 			if (src > source + (size >> 1) && dst - destination > src - source - ((src - source) >> 5))
 				return 0;
 			fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
 			cword_ptr = dst;
 			dst += CWORD_LEN;
 			cword_val = 1U << 31;
 			fetch = fast_read(src, 3);
 		}
 #if QLZ_COMPRESSION_LEVEL == 1
 		{
 			const unsigned char *o;
 			ui32 hash, cached;
 			hash = hash_func(fetch);
 			// cached == 0 means the slot last saw the same 3-byte value.
 			cached = fetch ^ state->hash[hash].cache;
 			state->hash[hash].cache = fetch;
 			o = state->hash[hash].offset + OFFSET_BASE;
 			state->hash[hash].offset = CAST(src - OFFSET_BASE);
 			if (cached == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6))))
 			{
 				if (*(o + 3) != *(src + 3))
 				{
 					// Exactly three matching bytes: 2-byte token.
 					hash <<= 4;
 					cword_val = (cword_val >> 1) | (1U << 31);
 					fast_write((3 - 2) | hash, dst, 2);
 					src += 3;
 					dst += 2;
 				}
 				else
 				{
 					const unsigned char *old_src = src;
 					size_t matchlen;
 					hash <<= 4;
 					cword_val = (cword_val >> 1) | (1U << 31);
 					src += 4;
 					if(*(o + (src - old_src)) == *src)
 					{
 						src++;
 						if(*(o + (src - old_src)) == *src)
 						{
 							size_t q = last_byte - UNCOMPRESSED_END - (src - 5) + 1;
 							size_t remaining = q > 255 ? 255 : q;
 							src++;
 							while(*(o + (src - old_src)) == *src && (size_t)(src - old_src) < remaining)
 								src++;
 						}
 					}
 					matchlen = src - old_src;
 					// Short matches fit the length in the low nibble of a
 					// 2-byte token; longer ones use a third byte for it.
 					if (matchlen < 18)
 					{
 						fast_write((ui32)(matchlen - 2) | hash, dst, 2);
 						dst += 2;
 					}
 					else
 					{
 						fast_write((ui32)(matchlen << 16) | hash, dst, 3);
 						dst += 3;
 					}
 				}
 				fetch = fast_read(src, 3);
 				lits = 0;
 			}
 			else
 			{
 				// Literal byte; slide the 3-byte fetch window forward by one.
 				lits++;
 				*dst = *src;
 				src++;
 				dst++;
 				cword_val = (cword_val >> 1);
 				fetch = (fetch >> 8 & 0xffff) | (*(src + 2) << 16);
 			}
 		}
 #elif QLZ_COMPRESSION_LEVEL >= 2
 		{
 			const unsigned char *o, *offset2;
 			ui32 hash, matchlen, k, m, best_k = 0;
 			unsigned char c;
 			// Longest match allowed from here, capped at 255.
 			size_t remaining = (last_byte - UNCOMPRESSED_END - src + 1) > 255 ? 255 : (last_byte - UNCOMPRESSED_END - src + 1);
 			(void)best_k;
 			//hash = hashat(src);
 			fetch = fast_read(src, 3);
 			hash = hash_func(fetch);
 			c = state->hash_counter[hash];
 			// Try the first candidate stored in this slot.
 			offset2 = state->hash[hash].offset[0];
 			if(offset2 < src - MINOFFSET && c > 0 && ((fast_read(offset2, 3) ^ fetch) & 0xffffff) == 0)
 			{
 				matchlen = 3;
 				if(*(offset2 + matchlen) == *(src + matchlen))
 				{
 					matchlen = 4;
 					while(*(offset2 + matchlen) == *(src + matchlen) && matchlen < remaining)
 						matchlen++;
 				}
 			}
 			else
 				matchlen = 0;
 			// Try the remaining candidates and keep the best one.
 			for(k = 1; k < QLZ_POINTERS && c > k; k++)
 			{
 				o = state->hash[hash].offset[k];
 #if QLZ_COMPRESSION_LEVEL == 3
 				if(((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET)
 #elif QLZ_COMPRESSION_LEVEL == 2
 				if(*(src + matchlen) == *(o + matchlen)	&& ((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET)
 #endif
 				{
 					m = 3;
 					while(*(o + m) == *(src + m) && m < remaining)
 						m++;
 #if QLZ_COMPRESSION_LEVEL == 3
 					if ((m > matchlen) || (m == matchlen && o > offset2))
 #elif QLZ_COMPRESSION_LEVEL == 2
 					if (m > matchlen)
 #endif
 					{
 						offset2 = o;
 						matchlen = m;
 						best_k = k;
 					}
 				}
 			}
 			o = offset2;
 			// Record the current position in the slot's ring of candidates.
 			state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src;
 			c++;
 			state->hash_counter[hash] = c;
 #if QLZ_COMPRESSION_LEVEL == 3
 			if(matchlen > 2 && src - o < 131071)
 			{
 				ui32 u;
 				size_t offset = src - o;
 				// Hash every position covered by the match as well.
 				for(u = 1; u < matchlen; u++)
 				{
 					hash = hashat(src + u);
 					c = state->hash_counter[hash]++;
 					state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src + u;
 				}
 				cword_val = (cword_val >> 1) | (1U << 31);
 				src += matchlen;
 				// Pick the shortest token (1 to 4 bytes) that can hold this
 				// offset/length pair; the low bits of the token tag its form.
 				if(matchlen == 3 && offset <= 63)
 				{
 					*dst = (unsigned char)(offset << 2);
 					dst++;
 				}
 				else if (matchlen == 3 && offset <= 16383)
 				{
 					ui32 f = (ui32)((offset << 2) | 1);
 					fast_write(f, dst, 2);
 					dst += 2;
 				}
 				else if (matchlen <= 18 && offset <= 1023)
 				{
 					ui32 f = ((matchlen - 3) << 2) | ((ui32)offset << 6) | 2;
 					fast_write(f, dst, 2);
 					dst += 2;
 				}
 				else if(matchlen <= 33)
 				{
 					ui32 f = ((matchlen - 2) << 2) | ((ui32)offset << 7) | 3;
 					fast_write(f, dst, 3);
 					dst += 3;
 				}
 				else
 				{
 					ui32 f = ((matchlen - 3) << 7) | ((ui32)offset << 15) | 3;
 					fast_write(f, dst, 4);
 					dst += 4;
 				}
 			}
 			else
 			{
 				*dst = *src;
 				src++;
 				dst++;
 				cword_val = (cword_val >> 1);
 			}
 #elif QLZ_COMPRESSION_LEVEL == 2
 			if(matchlen > 2)
 			{
 				cword_val = (cword_val >> 1) | (1U << 31);
 				src += matchlen;
 				// Token = candidate index | length | hash slot; lengths of 10
 				// or more are carried in a third byte.
 				if (matchlen < 10)
 				{
 					ui32 f = best_k | ((matchlen - 2) << 2) | (hash << 5);
 					fast_write(f, dst, 2);
 					dst += 2;
 				}
 				else
 				{
 					ui32 f = best_k | (matchlen << 16) | (hash << 5);
 					fast_write(f, dst, 3);
 					dst += 3;
 				}
 			}
 			else
 			{
 				*dst = *src;
 				src++;
 				dst++;
 				cword_val = (cword_val >> 1);
 			}
 #endif
 		}
 #endif
 	}
 	// Tail: everything past last_matchstart is emitted as literal bytes.
 	while (src <= last_byte)
 	{
 		if ((cword_val & 1) == 1)
 		{
 			fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
 			cword_ptr = dst;
 			dst += CWORD_LEN;
 			cword_val = 1U << 31;
 		}
 #if QLZ_COMPRESSION_LEVEL < 3
 		// Keep the hash table updated while at least 4 bytes remain.
 		if (src <= last_byte - 3)
 		{
 #if QLZ_COMPRESSION_LEVEL == 1
 			ui32 hash, fetch;
 			fetch = fast_read(src, 3);
 			hash = hash_func(fetch);
 			state->hash[hash].offset = CAST(src - OFFSET_BASE);
 			state->hash[hash].cache = fetch;
 #elif QLZ_COMPRESSION_LEVEL == 2
 			ui32 hash;
 			unsigned char c;
 			hash = hashat(src);
 			c = state->hash_counter[hash];
 			state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src;
 			c++;
 			state->hash_counter[hash] = c;
 #endif
 		}
 #endif
 		*dst = *src;
 		src++;
 		dst++;
 		cword_val = (cword_val >> 1);
 	}
 	// Shift the sentinel down to bit 0 so the final, partially filled control
 	// word has the same alignment as a full one, then write it back.
 	while((cword_val & 1) != 1)
 		cword_val = (cword_val >> 1);
 	fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
 	// min. size must be 9 bytes so that the qlz_size functions can take 9 bytes as argument
 	return dst - destination < 9 ? 9 : dst - destination;
 }
 // Decode a compressed packet at `source` (header included) into
 // destination[0..size).
 //
 // The payload is read as control words of CWORD_LEN bytes whose bits, taken
 // from the least-significant end, say whether the next item is a match token
 // (bit set) or literal data (bit clear); cword_val == 1 means the current
 // word is exhausted. `history` is the lowest address a match may refer to
 // and is only checked when QLZ_MEMORY_SAFE is defined.
 //
 // Returns `size` on success, or 0 when a QLZ_MEMORY_SAFE check fails.
 //
 // NOTE: both copy paths call memcpy_up(), which in this file is an assert(0)
 // stub, so in this port the function aborts as soon as it has to copy a match
 // or a literal run before the tail.
 static size_t qlz_decompress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_decompress *state, const unsigned char *history)
 {
 	const unsigned char *src = source + qlz_size_header((const char *)source);
 	unsigned char *dst = destination;
 	const unsigned char *last_destination_byte = destination + size - 1;
 	ui32 cword_val = 1;
 	const unsigned char *last_matchstart = last_destination_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END;
 	unsigned char *last_hashed = destination - 1;
 	const unsigned char *last_source_byte = source + qlz_size_compressed((const char *)source) - 1;
 	// bitlut[x] = number of trailing zero bits in the 4-bit value x (4 for 0),
 	// i.e. how many literals can be consumed before the next match bit.
 	static const ui32 bitlut[16] = {4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
 	(void) last_source_byte;
 	(void) last_hashed;
 	(void) state;
 	(void) history;
 	for(;;)
 	{
 		ui32 fetch;
 		// Load the next control word when the current one is used up.
 		if (cword_val == 1)
 		{
 #ifdef QLZ_MEMORY_SAFE
 			if(src + CWORD_LEN - 1 > last_source_byte)
 				return 0;
 #endif
 			cword_val = fast_read(src, CWORD_LEN);
 			src += CWORD_LEN;
 		}
 #ifdef QLZ_MEMORY_SAFE
 			if(src + 4 - 1 > last_source_byte)
 				return 0;
 #endif
 		fetch = fast_read(src, 4);
 		if ((cword_val & 1) == 1)
 		{
 			// Match token: decode its source position and length.
 			ui32 matchlen;
 			const unsigned char *offset2;
 #if QLZ_COMPRESSION_LEVEL == 1
 			ui32 hash;
 			cword_val = cword_val >> 1;
 			hash = (fetch >> 4) & 0xfff;
 			offset2 = (const unsigned char *)(size_t)state->hash[hash].offset;
 			if((fetch & 0xf) != 0)
 			{
 				matchlen = (fetch & 0xf) + 2;
 				src += 2;
 			}
 			else
 			{
 				matchlen = *(src + 2);
 				src += 3;
 			}
 #elif QLZ_COMPRESSION_LEVEL == 2
 			ui32 hash;
 			unsigned char c;
 			cword_val = cword_val >> 1;
 			hash = (fetch >> 5) & 0x7ff;
 			c = (unsigned char)(fetch & 0x3);
 			offset2 = state->hash[hash].offset[c];
 			if((fetch & (28)) != 0)
 			{
 				matchlen = ((fetch >> 2) & 0x7) + 2;
 				src += 2;
 			}
 			else
 			{
 				matchlen = *(src + 2);
 				src += 3;
 			}
 #elif QLZ_COMPRESSION_LEVEL == 3
 			// The low bits of the token select one of five encodings.
 			ui32 offset;
 			cword_val = cword_val >> 1;
 			if ((fetch & 3) == 0)
 			{
 				offset = (fetch & 0xff) >> 2;
 				matchlen = 3;
 				src++;
 			}
 			else if ((fetch & 2) == 0)
 			{
 				offset = (fetch & 0xffff) >> 2;
 				matchlen = 3;
 				src += 2;
 			}
 			else if ((fetch & 1) == 0)
 			{
 				offset = (fetch & 0xffff) >> 6;
 				matchlen = ((fetch >> 2) & 15) + 3;
 				src += 2;
 			}
 			else if ((fetch & 127) != 3)
 			{
 				offset = (fetch >> 7) & 0x1ffff;
 				matchlen = ((fetch >> 2) & 0x1f) + 2;
 				src += 3;
 			}
 			else
 			{
 				offset = (fetch >> 15);
 				matchlen = ((fetch >> 7) & 255) + 3;
 				src += 4;
 			}
 			offset2 = dst - offset;
 #endif
 #ifdef QLZ_MEMORY_SAFE
 			if(offset2 < history || offset2 > dst - MINOFFSET - 1)
 				return 0;
 			if(matchlen > (ui32)(last_destination_byte - dst - UNCOMPRESSED_END + 1))
 				return 0;
 #endif
 			memcpy_up(dst, offset2, matchlen);
 			dst += matchlen;
 #if QLZ_COMPRESSION_LEVEL <= 2
 			update_hash_upto(state, &last_hashed, dst - matchlen);
 			last_hashed = dst - 1;
 #endif
 		}
 		else
 		{
 			if (dst < last_matchstart)
 			{
 				// Literal run: copy 4 bytes but advance only by the number
 				// of literal bits that precede the next match bit.
 				unsigned int n = bitlut[cword_val & 0xf];
 				memcpy_up(dst, src, 4);
 				cword_val = cword_val >> n;
 				dst += n;
 				src += n;
 #if QLZ_COMPRESSION_LEVEL <= 2
 				update_hash_upto(state, &last_hashed, dst - 3);
 #endif
 			}
 			else
 			{
 				// Tail: the remaining output is all literals, copied byte by
 				// byte while skipping over any control words in between.
 				while(dst <= last_destination_byte)
 				{
 					if (cword_val == 1)
 					{
 						src += CWORD_LEN;
 						cword_val = 1U << 31;
 					}
 #ifdef QLZ_MEMORY_SAFE
 					if(src >= last_source_byte + 1)
 						return 0;
 #endif
 					*dst = *src;
 					dst++;
 					src++;
 					cword_val = cword_val >> 1;
 				}
 #if QLZ_COMPRESSION_LEVEL <= 2
 				update_hash_upto(state, &last_hashed, last_destination_byte - 3); // todo, use constant
 #endif
 				return size;
 			}
 		}
 	}
 }
 // Compress `size` bytes from `source` into a self-describing packet at
 // `destination` and return the packet length in bytes (header included).
 //
 // Returns 0 without writing anything when size is 0 or larger than
 // 0xffffffff - 400. Inputs shorter than 216 bytes get a 3-byte header
 // (flags, 1-byte packet size, 1-byte original size); all others get a 9-byte
 // header (flags, 4-byte packet size, 4-byte original size). When
 // qlz_compress_core() reports 0, the payload is stored as a plain copy of
 // the input and the "compressed" flag bit is left clear.
 //
 // With QLZ_STREAMING_BUFFER > 0 the input is additionally kept in
 // state->stream_buffer so later packets can refer back to it, unless this
 // packet would not fit, in which case the table is reset and the packet is
 // compressed on its own.
 size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state)
 {
 	size_t r;
 	ui32 compressed;
 	size_t base;
 	if(size == 0 || size > 0xffffffff - 400)
 		return 0;
 	// base = header length for this packet.
 	if(size < 216)
 		base = 3;
 	else
 		base = 9;
 #if QLZ_STREAMING_BUFFER > 0
 	if (state->stream_counter + size - 1 >= QLZ_STREAMING_BUFFER)
 #endif
 	{
 		// Stand-alone packet: start from an empty hash table.
 		reset_table_compress(state);
 		r = base + qlz_compress_core((const unsigned char *)source, (unsigned char*)destination + base, size, state);
 #if QLZ_STREAMING_BUFFER > 0
 		reset_table_compress(state);
 #endif
 		// Core returned 0: fall back to storing the input verbatim.
 		if(r == base)
 		{
 			bench_memcpy(destination + base, source, size);
 			r = size + base;
 			compressed = 0;
 		}
 		else
 		{
 			compressed = 1;
 		}
 		state->stream_counter = 0;
 	}
 #if QLZ_STREAMING_BUFFER > 0
 	else
 	{
 		unsigned char *src = state->stream_buffer + state->stream_counter;
 		bench_memcpy(src, source, size);
 		r = base + qlz_compress_core(src, (unsigned char*)destination + base, size, state);
 		if(r == base)
 		{
 			bench_memcpy(destination + base, src, size);
 			r = size + base;
 			compressed = 0;
 			reset_table_compress(state);
 		}
 		else
 		{
 			compressed = 1;
 		}
 		state->stream_counter += size;
 	}
 #endif
 	// Write the header: bit 0 = compressed flag, bit 1 = long (9-byte) header.
 	if(base == 3)
 	{
 		*destination = (unsigned char)(0 | compressed);
 		*(destination + 1) = (unsigned char)r;
 		*(destination + 2) = (unsigned char)size;
 	}
 	else
 	{
 		*destination = (unsigned char)(2 | compressed);
 		fast_write((ui32)r, destination + 1, 4);
 		fast_write((ui32)size, destination + 5, 4);
 	}
 	// Remaining flag bits: compression level (bits 2-3), streaming buffer
 	// code (bits 4-5), and bit 6 always set.
 	*destination |= (QLZ_COMPRESSION_LEVEL << 2);
 	*destination |= (1 << 6);
 	*destination |= ((QLZ_STREAMING_BUFFER == 0 ? 0 : (QLZ_STREAMING_BUFFER == 100000 ? 1 : (QLZ_STREAMING_BUFFER == 1000000 ? 2 : 3))) << 4);
 // 76543210
 // 01SSLLHC
 	return r;
 }
 // Decompress the packet at `source` into `destination` and return the number
 // of bytes produced (the size stored in the packet header, or whatever
 // qlz_decompress_core() returns for a compressed packet).
 //
 // Bit 0 of the first header byte tells whether the payload is compressed; if
 // it is clear, the payload is copied verbatim from behind the header.
 // With QLZ_STREAMING_BUFFER > 0, packets that still fit are decoded into
 // state->stream_buffer first and then copied out to `destination`.
 size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state)
 {
 	size_t dsiz = qlz_size_decompressed(source);
 #if QLZ_STREAMING_BUFFER > 0
 	if (state->stream_counter + qlz_size_decompressed(source) - 1 >= QLZ_STREAMING_BUFFER)
 #endif
 	{
 		// Stand-alone packet: decode directly into the caller's buffer.
 		if((*source & 1) == 1)
 		{
 			reset_table_decompress(state);
 			dsiz = qlz_decompress_core((const unsigned char *)source, (unsigned char *)destination, dsiz, state, (const unsigned char *)destination);
 		}
 		else
 		{
 			bench_memcpy(destination, source + qlz_size_header(source), dsiz);
 		}
 		state->stream_counter = 0;
 		reset_table_decompress(state);
 	}
 #if QLZ_STREAMING_BUFFER > 0
 	else
 	{
 		unsigned char *dst = state->stream_buffer + state->stream_counter;
 		if((*source & 1) == 1)
 		{
 			dsiz = qlz_decompress_core((const unsigned char *)source, dst, dsiz, state, (const unsigned char *)state->stream_buffer);
 		}
 		else
 		{
 			bench_memcpy(dst, source + qlz_size_header(source), dsiz);
 			reset_table_decompress(state);
 		}
 		bench_memcpy(destination, dst, dsiz);
 		state->stream_counter += dsiz;
 	}
 #endif
 	return dsiz;
 }
--- a/microbench/src/lzip/quicklz.h
+++ b/microbench/src/lzip/quicklz.h
@ -0,0 +1,164 @@
 #ifndef QLZ_HEADER
 #define QLZ_HEADER
 #include <am.h>
 #include <klib.h>
 // Copy n bytes from src to dst and return dst. Unlike memcpy, overlapping
 // regions are handled: when dst starts inside [src, src + n) the copy runs
 // backwards so that source bytes are read before they are overwritten.
 // Both pointers must be non-NULL (asserted).
 static inline void* bench_memcpy(void* dst, const void* src, size_t n){
  assert(dst&&src);
  // Convert to byte pointers up front: arithmetic on void* is a GNU
  // extension, not ISO C.
  const char* s = (const char*)src;
  char* d = (char*)dst;
  if(s<d&&s+n>d){
    s+=n;
    d+=n;
    while(n-->0)*--d=*--s;
  }
  else{
    while(n-->0)*d++=*s++;
  }
  return dst;
 }
 // Fast data compression library
 // Copyright (C) 2006-2011 Lasse Mikkel Reinhold
 // lar@quicklz.com
 //
 // QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
 // released into public must be open source) or under a commercial license if such
 // has been acquired (see http://www.quicklz.com/order.html). The commercial license
 // does not cover derived or ported versions created by third parties under GPL.
 // You can edit the following user settings. Data must be decompressed with the same
 // setting of QLZ_COMPRESSION_LEVEL and QLZ_STREAMING_BUFFER as it was compressed
 // (see manual). If QLZ_STREAMING_BUFFER > 0, scratch buffers must be initially
 // zeroed out (see manual). First #ifndef makes it possible to define settings from
 // the outside like the compiler command line.
 // 1.5.0 final
 #ifndef QLZ_COMPRESSION_LEVEL
 	// 1 gives fastest compression speed. 3 gives fastest decompression speed and best
 	// compression ratio.
 	//#define QLZ_COMPRESSION_LEVEL 1
 	//#define QLZ_COMPRESSION_LEVEL 2
 	//#define QLZ_COMPRESSION_LEVEL 3
 	#define QLZ_COMPRESSION_LEVEL 2
 	// If > 0, zero out both states prior to first call to qlz_compress() or qlz_decompress()
 	// and decompress packets in the same order as they were compressed
 	#define QLZ_STREAMING_BUFFER 0
 	//#define QLZ_STREAMING_BUFFER 100000
 	//#define QLZ_STREAMING_BUFFER 1000000
 	// Guarantees that decompression of corrupted data cannot crash. Decreases decompression
 	// speed 10-20%. Compression speed not affected.
 	//#define QLZ_MEMORY_SAFE
 #endif
 #define QLZ_VERSION_MAJOR 1
 #define QLZ_VERSION_MINOR 5
 #define QLZ_VERSION_REVISION 0
 // Verify compression level
 #if QLZ_COMPRESSION_LEVEL != 1 && QLZ_COMPRESSION_LEVEL != 2 && QLZ_COMPRESSION_LEVEL != 3
 #error QLZ_COMPRESSION_LEVEL must be 1, 2 or 3
 #endif
 typedef unsigned int ui32;
 typedef unsigned short int ui16;
 // Decrease QLZ_POINTERS for level 3 to increase compression speed. Do not touch any other values!
 #if QLZ_COMPRESSION_LEVEL == 1
 #define QLZ_POINTERS 1
 #define QLZ_HASH_VALUES 4096
 #elif QLZ_COMPRESSION_LEVEL == 2
 #define QLZ_POINTERS 4
 #define QLZ_HASH_VALUES 2048
 #elif QLZ_COMPRESSION_LEVEL == 3
 #define QLZ_POINTERS 16
 #define QLZ_HASH_VALUES 4096
 #endif
 // Compressor hash-table entry.
 // Level 1: one cached fetch value plus one position per slot; the position is
 // stored as a 32-bit integer when QLZ_PTR_64 is defined and streaming is off,
 // otherwise as a pointer into the input.
 // Levels 2 and 3: QLZ_POINTERS candidate positions per slot.
 typedef struct
 {
 #if QLZ_COMPRESSION_LEVEL == 1
 	ui32 cache;
 #if defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
 	unsigned int offset;
 #else
 	const unsigned char *offset;
 #endif
 #else
 	const unsigned char *offset[QLZ_POINTERS];
 #endif
 } qlz_hash_compress;
 // Decompressor hash-table entry: a single position at level 1, otherwise
 // QLZ_POINTERS candidate positions per slot.
 typedef struct
 {
 #if QLZ_COMPRESSION_LEVEL == 1
 	const unsigned char *offset;
 #else
 	const unsigned char *offset[QLZ_POINTERS];
 #endif
 } qlz_hash_decompress;
 // Compressor scratch state passed to qlz_compress().
 typedef struct
 {
 	#if QLZ_STREAMING_BUFFER > 0
 		// Copy of recent input so later packets can reference earlier ones.
 		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
 	#endif
 	// Number of bytes currently held in stream_buffer (reset to 0 otherwise).
 	size_t stream_counter;
 	// Hash table and, per slot, a running count of positions stored in it.
 	qlz_hash_compress hash[QLZ_HASH_VALUES];
 	unsigned char hash_counter[QLZ_HASH_VALUES];
 } qlz_state_compress;
 // Decompressor scratch state passed to qlz_decompress(). Levels 1 and 2 carry
 // a hash table mirroring the compressor's; level 3 needs only the streaming
 // fields.
 #if QLZ_COMPRESSION_LEVEL == 1 || QLZ_COMPRESSION_LEVEL == 2
 	typedef struct
 	{
 #if QLZ_STREAMING_BUFFER > 0
 		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
 #endif
 		qlz_hash_decompress hash[QLZ_HASH_VALUES];
 		unsigned char hash_counter[QLZ_HASH_VALUES];
 		size_t stream_counter;
 	} qlz_state_decompress;
 #elif QLZ_COMPRESSION_LEVEL == 3
 	typedef struct
 	{
 #if QLZ_STREAMING_BUFFER > 0
 		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
 #endif
 // This inner condition can never hold inside the level-3 branch, so the
 // hash member below is never compiled in.
 #if QLZ_COMPRESSION_LEVEL <= 2
 		qlz_hash_decompress hash[QLZ_HASH_VALUES];
 #endif
 		size_t stream_counter;
 	} qlz_state_decompress;
 #endif
 #if defined (__cplusplus)
 extern "C" {
 #endif
 // Public functions of QuickLZ
 // Decompressed payload size recorded in the packet header at `source`.
 size_t qlz_size_decompressed(const char *source);
 // Total packet size recorded in the packet header at `source`.
 size_t qlz_size_compressed(const char *source);
 // Compress `size` bytes from `source` into a packet at `destination`;
 // returns the packet length, or 0 if `size` is 0 or too large.
 size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state);
 // Decompress the packet at `source` into `destination`; returns the number of
 // bytes produced.
 size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state);
 // Query a build-time setting (compression level, state sizes, version, ...);
 // returns -1 for an unknown selector.
 int qlz_get_setting(int setting);
 #if defined (__cplusplus)
 }
 #endif
 #endif
--- a/microbench/src/md5/md5.c
+++ b/microbench/src/md5/md5.c
@ -0,0 +1,159 @@
 /*
 * Simple MD5 implementation (github.com/pod32g/md5)
 *
 */
 #include <benchmark.h>
 static int N;
 // Per-step additive constants for the 64 MD5 steps, indexed by step number.
 // As defined by the MD5 specification (RFC 1321): k[i] is the integer part of
 // abs(sin(i + 1)) * 2^32, with i + 1 in radians.
 const uint32_t k[64] = {
 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee ,
 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 ,
 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be ,
 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 ,
 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa ,
 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 ,
 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed ,
 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a ,
 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c ,
 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 ,
 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 ,
 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 ,
 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 ,
 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 ,
 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 ,
 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 };
 // r specifies the left-rotation amount for each of the 64 steps: one row per
 // round of 16 steps, each row repeating a pattern of four shift amounts.
 static const uint32_t r[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
                 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20,
                 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
                 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
 // leftrotate function definition
 #define LEFTROTATE(x, c) (((x) << (c)) | ((x) >> (32 - (c))))
 // Serialise val into bytes[0..3], least-significant byte first.
 static void to_bytes(uint32_t val, uint8_t *bytes)
 {
    for (int idx = 0; idx < 4; idx++) {
        bytes[idx] = (uint8_t) (val >> (8 * idx));
    }
 }
 // Assemble bytes[0..3] into a 32-bit value, bytes[0] being least significant.
 static uint32_t to_int32(const uint8_t *bytes)
 {
    uint32_t acc = 0;
    for (int idx = 3; idx >= 0; idx--) {
        acc = (acc << 8) | (uint32_t) bytes[idx];
    }
    return acc;
 }
 // Compute the MD5 digest of msg[0..initial_len) and store the 16 result bytes
 // in digest.
 //
 // IMPORTANT: the message is padded IN PLACE. After the initial_len message
 // bytes this function writes a 0x80 byte, zero bytes up to the next length
 // that is 56 mod 64, and then 8 length bytes. The buffer behind msg must
 // therefore have room for up to initial_len + 72 bytes, and its contents past
 // initial_len are overwritten.
 static void md5(uint8_t *msg, size_t initial_len, uint8_t *digest) {
    // These vars will contain the hash
    uint32_t h0, h1, h2, h3;
    size_t new_len, offset;
    uint32_t w[16];
    uint32_t a, b, c, d, i, f, g, temp;
    // Initial chaining values (the hex digits simply count up and down):
    h0 = 0x67452301;
    h1 = 0xefcdab89;
    h2 = 0x98badcfe;
    h3 = 0x10325476;
    //Pre-processing:
    //append "1" bit to message
    //append "0" bits until message length in bits ≡ 448 (mod 512)
    //append length mod (2^64) to message
    // new_len = smallest length > initial_len that is 56 mod 64 (in bytes)
    for (new_len = initial_len + 1; new_len % (512/8) != 448/8; new_len++)
        ;
    msg[initial_len] = 0x80; // append the "1" bit; most significant bit is "first"
    for (offset = initial_len + 1; offset < new_len; offset++)
        msg[offset] = 0; // append "0" bits
    // append the len in bits at the end of the buffer.
    to_bytes(initial_len*8, msg + new_len);
    // initial_len>>29 == initial_len*8>>32, but avoids overflow.
    to_bytes(initial_len>>29, msg + new_len + 4);
    // Process the message in successive 512-bit chunks:
    //for each 512-bit chunk of message:
    // (the last chunk starts at new_len - 56 and includes the 8 length bytes)
    for(offset=0; offset<new_len; offset += (512/8)) {
        // break chunk into sixteen 32-bit words w[j], 0 ≤ j ≤ 15
        for (i = 0; i < 16; i++)
            w[i] = to_int32(msg + offset + i*4);
        // Initialize hash value for this chunk:
        a = h0;
        b = h1;
        c = h2;
        d = h3;
        // Main loop:
        // Each group of 16 steps uses its own mixing function f and its own
        // order g in which the message words are consumed.
        for(i = 0; i<64; i++) {
            if (i < 16) {
                f = (b & c) | ((~b) & d);
                g = i;
            } else if (i < 32) {
                f = (d & b) | ((~d) & c);
                g = (5*i + 1) % 16;
            } else if (i < 48) {
                f = b ^ c ^ d;
                g = (3*i + 5) % 16;
            } else {
                f = c ^ (b | (~d));
                g = (7*i) % 16;
            }
            // Rotate the four working registers and fold in this step.
            temp = d;
            d = c;
            c = b;
            b = b + LEFTROTATE((a + f + k[i] + w[g]), r[i]);
            a = temp;
        }
        // Add this chunk's hash to result so far:
        h0 += a;
        h1 += b;
        h2 += c;
        h3 += d;
    }
    //var char digest[16] := h0 append h1 append h2 append h3 //(Output is in little-endian)
    to_bytes(h0, digest);
    to_bytes(h1, digest + 4);
    to_bytes(h2, digest + 8);
    to_bytes(h3, digest + 12);
 }
 static uint8_t *str;
 static uint8_t *digest;
 // Allocate and fill the N-byte message with bench_rand() output (seed 1) and
 // allocate the 16-byte digest buffer.
 //
 // md5() pads the message in place: it appends a 0x80 byte, zeros up to the
 // next length that is 56 mod 64 (at most 64 bytes in total), and 8 length
 // bytes. The message buffer therefore needs N + 64 + 8 bytes; allocating only
 // N let md5() write past the end of the allocation.
 void bench_md5_prepare() {
  N = setting->size;
  bench_srand(1);
  str = bench_alloc(N + 64 + 8);
  for (int i = 0; i < N; i ++) {
    str[i] = bench_rand();
  }
  digest = bench_alloc(16);
 }
 // Timed section: hash the N-byte message in str into digest.
 void bench_md5_run() {
  const size_t message_len = N;
  md5(str, message_len, digest);
 }
 // Returns non-zero when the checksum of the 16 digest bytes equals the
 // expected value.
 int bench_md5_validate() {
  uint8_t *end = digest + 16;
  return checksum(digest, end) == setting->checksum;
 }
--- a/microbench/src/qsort/qsort.c
+++ b/microbench/src/qsort/qsort.c
@ -0,0 +1,44 @@
 #include <benchmark.h>
 static int N, *data;
 // Allocate N ints and fill each with two consecutive bench_rand() draws
 // (seed 1): the first shifted into the upper half, the second OR-ed in.
 void bench_qsort_prepare() {
  bench_srand(1);
  N = setting->size;
  data = bench_alloc(N * sizeof(int));
  int idx = 0;
  while (idx < N) {
    int hi = bench_rand();
    int lo = bench_rand();
    data[idx] = (hi << 16) | lo;
    idx++;
  }
 }
 // Exchange the ints pointed to by a and b (a no-op when both alias).
 static void swap(int *a, int *b) {
  const int first = *a;
  const int second = *b;
  *a = second;
  *b = first;
 }
 // Sort the half-open range a[l..r) in ascending order. a[l] is used as the
 // pivot: smaller elements are gathered right behind it, the pivot is swapped
 // into its final slot, and both sides are sorted recursively.
 static void myqsort(int *a, int l, int r) {
  if (l >= r) return;
  const int key = a[l];
  int last = l;  // index of the last element known to be < key
  for (int k = l + 1; k < r; k++) {
    if (a[k] < key) {
      last++;
      swap(&a[last], &a[k]);
    }
  }
  swap(&a[last], &a[l]);
  myqsort(a, l, last);
  myqsort(a, last + 1, r);
 }
 // Timed section: sort data[0..N).
 void bench_qsort_run() {
  const int first = 0;
  const int past_last = N;
  myqsort(data, first, past_last);
 }
 // Returns non-zero when the checksum over data[0..N) equals the expected
 // value.
 int bench_qsort_validate() {
  int *end = data + N;
  return checksum(data, end) == setting->checksum;
 }
--- a/microbench/src/queen/queen.c
+++ b/microbench/src/queen/queen.c
@ -0,0 +1,32 @@
 #include <benchmark.h>
 // Bit mask with one bit per board column; all set means every column is used.
 static unsigned int FULL;
 // Count the ways to complete a queen placement, one board row per recursion
 // level. `row` holds the occupied columns; `ld` and `rd` hold the columns
 // attacked along the two diagonal directions, shifted by one per level.
 static unsigned int dfs(unsigned int row, unsigned int ld, unsigned int rd) {
  if (row == FULL) return 1;
  unsigned int total = 0;
  // Visit each free column, lowest set bit first.
  for (unsigned int open = FULL & ~(row | ld | rd); open != 0; open &= open - 1) {
    const unsigned int bit = open & (0u - open);
    total += dfs(row | bit, (ld | bit) << 1, (rd | bit) >> 1);
  }
  return total;
 }
 static unsigned int ans;
 // Set FULL to a mask with setting->size low bits set and clear the result.
 void bench_queen_prepare() {
  FULL = (1 << setting->size) - 1;
  ans = 0;
 }
 // Timed section: count all placements starting from an empty board.
 void bench_queen_run() {
  const unsigned int empty = 0;
  ans = dfs(empty, empty, empty);
 }
 // Returns non-zero when the placement count equals the expected checksum.
 int bench_queen_validate() {
  const int matches = (ans == setting->checksum);
  return matches;
 }
--- a/microbench/src/sieve/sieve.c
+++ b/microbench/src/sieve/sieve.c
@ -0,0 +1,42 @@
 #include <benchmark.h>
 static int N;
 static int ans;
 static uint32_t *primes;
 // Return bit n of the primes bitmap (32 bits per word) as 0 or 1.
 static inline int get(int n) {
  const uint32_t word = primes[n >> 5];
  return (word >> (n & 31)) & 1;
 }
 // Clear bit n of the primes bitmap (32 bits per word).
 static inline void clear(int n) {
  const unsigned long bit = 1ul << (n & 31);
  primes[n >> 5] &= ~bit;
 }
 // Allocate the bitmap (N / 8 + 128 bytes) and set every bit in words
 // 0 .. N / 32, so that all numbers up to N start out marked.
 void bench_sieve_prepare() {
  N = setting->size;
  primes = (uint32_t*)bench_alloc(N / 8 + 128);
  const int last_word = N / 32;
  int w = 0;
  while (w <= last_word) {
    primes[w] = 0xffffffff;
    w++;
  }
 }
 // Timed section: sieve out composites up to N and count the bits that remain
 // set in 2..N into `ans`. If any bit in 1..N is already clear on entry the
 // function returns immediately and leaves `ans` untouched.
 void bench_sieve_run() {
  int probe = 1;
  while (probe <= N) {
    if (!get(probe)) return;
    probe++;
  }
  for (int base = 2; base * base <= N; base++) {
    if (!get(base)) continue;
    // Strike out every multiple of base, starting at 2 * base.
    for (int multiple = base + base; multiple <= N; multiple += base) {
      clear(multiple);
    }
  }
  ans = 0;
  for (int k = 2; k <= N; k++) {
    if (get(k)) ans++;
  }
 }
 // Returns non-zero when the count in `ans` equals the expected checksum.
 int bench_sieve_validate() {
  const int matches = (ans == setting->checksum);
  return matches;
 }
--- a/microbench/src/ssort/ssort.cc
+++ b/microbench/src/ssort/ssort.cc
@ -0,0 +1,111 @@
 // This is the Skew algorithm's reference implementation.
 #include <benchmark.h>
 // Length of the benchmark input; assigned in bench_ssort_prepare().
 static int N;
 // Lexicographic "less than or equal" on pairs: (a1,a2) <= (b1,b2).
 inline bool leq(int a1, int a2,   int b1, int b2) {
  if (a1 != b1) return a1 < b1;   // first components decide
  return a2 <= b2;                // tie: second components decide
 }
 // Lexicographic "less than or equal" on triples:
 // (a1,a2,a3) <= (b1,b2,b3).
 inline bool leq(int a1, int a2, int a3,   int b1, int b2, int b3) {
  if (a1 != b1) return a1 < b1;   // first components decide
  return leq(a2, a3, b2, b3);     // tie: compare the remaining pairs
 }
 // Stable counting sort: copies a[0..n-1] into b[0..n-1], ordered by the key
 // r[a[i]], where every key lies in 0..K. The counter array is obtained from
 // bench_alloc and is not released here.
 static void radixPass(int* a, int* b, int* r, int n, int K)
 {
  int* bucket = (int*)bench_alloc(sizeof(int)*(K+1));
  // start with empty buckets
  for (int key = 0; key <= K; key++) {
    bucket[key] = 0;
  }
  // count how often each key occurs
  for (int i = 0; i < n; i++) {
    int key = r[a[i]];
    bucket[key] += 1;
  }
  // exclusive prefix sums: turn each count into the bucket's first output slot
  int offset = 0;
  for (int key = 0; key <= K; key++) {
    int count = bucket[key];
    bucket[key] = offset;
    offset += count;
  }
  // scatter in input order, which keeps equal keys in their original order
  for (int i = 0; i < n; i++) {
    int item = a[i];
    int slot = bucket[r[item]]++;
    b[slot] = item;
  }
 }
 // find the suffix array SA of s[0..n-1] in {1..K}^n
 // require s[n]=s[n+1]=s[n+2]=0, n>=2
 //
 // Steps, as laid out below:
 //   1. collect the positions i with i%3 != 0 and radix sort them by the
 //      triple (s[i], s[i+1], s[i+2]);
 //   2. give each distinct triple a name; if names repeat, recurse on the
 //      string of names to order those suffixes;
 //   3. sort the i%3 == 0 suffixes by first character, using the order
 //      obtained in step 2 to break ties;
 //   4. merge the two sorted groups into SA.
 // All work arrays come from bench_alloc and are not released in this function.
 void suffixArray(int* s, int* SA, int n, int K) {
  // n0, n1, n2: number of positions i in [0,n) with i%3 == 0, 1, 2.
  // n02 uses n0 rather than n1 so that it also counts the dummy mod 1 suffix
  // added when n%3 == 1 (n0 and n1 are equal otherwise).
  int n0=(n+2)/3, n1=(n+1)/3, n2=n/3, n02=n0+n2;
  // s12 and SA12 get three extra zero entries at the end; they are passed as
  // the input string `s` of the recursive call, which reads s[n..n+2].
  int* s12  = (int*)bench_alloc(sizeof(int)*(n02+3));  s12[n02]= s12[n02+1]= s12[n02+2]=0;
  int* SA12 = (int*)bench_alloc(sizeof(int)*(n02+3)); SA12[n02]=SA12[n02+1]=SA12[n02+2]=0;
  int* s0   = (int*)bench_alloc(sizeof(int)*n0);
  int* SA0  = (int*)bench_alloc(sizeof(int)*n0);
  // generate positions of mod 1 and mod  2 suffixes
  // the "+(n0-n1)" adds a dummy mod 1 suffix if n%3 == 1
  for (int i=0, j=0;  i < n+(n0-n1);  i++) if (i%3 != 0) s12[j++] = i;
  // lsb radix sort the mod 1 and mod 2 triples
  // (three stable passes, least significant character first: s[i+2], s[i+1], s[i])
  radixPass(s12 , SA12, s+2, n02, K);
  radixPass(SA12, s12 , s+1, n02, K);
  radixPass(s12 , SA12, s  , n02, K);
  // find lexicographic names of triples
  // (SA12 is sorted by triple, so equal triples are adjacent and share a name)
  int name = 0, c0 = -1, c1 = -1, c2 = -1;
  for (int i = 0;  i < n02;  i++) {
    if (s[SA12[i]] != c0 || s[SA12[i]+1] != c1 || s[SA12[i]+2] != c2) {
      name++;  c0 = s[SA12[i]];  c1 = s[SA12[i]+1];  c2 = s[SA12[i]+2];
    }
    if (SA12[i] % 3 == 1) { s12[SA12[i]/3]      = name; } // left half
    else                  { s12[SA12[i]/3 + n0] = name; } // right half
  }
  // recurse if names are not yet unique
  if (name < n02) {
    suffixArray(s12, SA12, n02, name);
    // store unique names in s12 using the suffix array
    for (int i = 0;  i < n02;  i++) s12[SA12[i]] = i + 1;
  } else // generate the suffix array of s12 directly
    for (int i = 0;  i < n02;  i++) SA12[s12[i] - 1] = i;
  // stably sort the mod 0 suffixes from SA12 by their first character
  // (entries of SA12 below n0 are mod 1 suffixes; 3*SA12[i] is the mod 0
  // position immediately before each of them)
  for (int i=0, j=0;  i < n02;  i++) if (SA12[i] < n0) s0[j++] = 3*SA12[i];
  radixPass(s0, SA0, s, n0, K);
  // merge sorted SA0 suffixes and sorted SA12 suffixes
  // (t starts at n0-n1, which skips the dummy suffix when one was added)
  for (int p=0,  t=n0-n1,  k=0;  k < n;  k++) {
 // GetI(): position in s of the suffix that SA12[t] refers to
 #define GetI() (SA12[t] < n0 ? SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2)
    int i = GetI(); // pos of current offset 12 suffix
    int j = SA0[p]; // pos of current offset 0  suffix
    // mod 1 suffixes are compared as pairs, mod 2 suffixes as triples; the
    // last component of each is a rank taken from s12
    if (SA12[t] < n0 ?
        leq(s[i],       s12[SA12[t] + n0], s[j],       s12[j/3]) :
        leq(s[i],s[i+1],s12[SA12[t]-n0+1], s[j],s[j+1],s12[j/3+n0]))
    { // suffix from SA12 is smaller
      SA[k] = i;  t++;
      if (t == n02) { // done --- only SA0 suffixes left
        for (k++;  p < n0;  p++, k++) SA[k] = SA0[p];
      }
    } else {
      SA[k] = j;  p++;
      if (p == n0)  { // done --- only SA12 suffixes left
        for (k++;  t < n02;  t++, k++) SA[k] = GetI();
      }
    }
  }
 }
 extern "C" {
 // Input string and the suffix array computed from it.
 static int *s;
 static int *sa;
 // Allocates both arrays with 10 spare entries beyond N and fills s[0..N-1]
 // with pseudo-random values in 0..25 from a generator seeded with 1.
 // NOTE(review): suffixArray requires s[N..N+2] == 0 but nothing here writes
 // those entries; this presumably relies on bench_alloc returning zeroed
 // memory - confirm.
 // NOTE(review): suffixArray's header asks for symbols in 1..K, while the
 // values generated here include 0 - confirm this is intended.
 void bench_ssort_prepare() {
  N = setting->size;
  bench_srand(1);
  const int padded = N + 10;
  s = (int*)bench_alloc(sizeof(int)*padded);
  sa = (int*)bench_alloc(sizeof(int)*padded);
  int pos = 0;
  while (pos < N) {
    s[pos] = bench_rand() % 26;
    pos++;
  }
 }
 // Benchmark body: builds the suffix array of s[0..N-1] into sa, passing 26
 // as the largest symbol value.
 void bench_ssort_run() {
  suffixArray(s, sa, N, 26);
 }
 // Returns nonzero when checksum() over sa .. sa + N equals
 // setting->checksum, zero otherwise.
 int bench_ssort_validate() {
  return checksum(sa, sa + N) == setting->checksum;
 }
 }
--- a/thread-os/Makefile
+++ b/thread-os/Makefile
@ -0,0 +1,3 @@
 # Build description for thread-os: the target name, its single source file,
 # and the shared rules pulled in from $(AM_HOME)/Makefile.
 NAME := thread-os
 SRCS := thread-os.c
 include $(AM_HOME)/Makefile
--- a/thread-os/thread-os.c
+++ b/thread-os/thread-os.c
@ -0,0 +1,71 @@
 #include <am.h>
 #include <klib.h>
 #include <klib-macros.h>
 // Number of slots in the per-CPU `currents` table below.
 #define MAX_CPU 8
 // A task. The bookkeeping fields share storage with an 8192-byte stack: the
 // anonymous struct sits at the start of the union, and main hands the bytes
 // after `context` up to the end of the union to kcontext() as the stack area.
 typedef union task {
  struct {
    // label; main passes it to kcontext() as the entry function's argument
    const char *name;
    // next task in the circular list that main builds
    union task *next;
    // function the task runs
    void      (*entry)(void *);
    // context to resume; written by main (from kcontext) and by on_interrupt
    Context    *context;
  };
  uint8_t stack[8192];
 } Task;
 // Task selected on each CPU, indexed by cpu_current(). A slot is NULL until
 // on_interrupt() first runs on that CPU.
 Task *currents[MAX_CPU];
 // Shorthand for the calling CPU's slot in `currents`.
 #define current currents[cpu_current()]
 // user-defined tasks
 // Global spin lock flag: 0 when free, 1 when held. func() takes it around
 // its printf call.
 int locked = 0;
 // Spins until the exchange returns 0, i.e. until this caller is the one that
 // changed the flag from 0 to 1.
 void lock() {
  while (atomic_xchg(&locked, 1) != 0) {
    // busy-wait
  }
 }
 // Releases the lock by storing 0 into the flag.
 void unlock() {
  atomic_xchg(&locked, 0);
 }
 // Task body shared by all tasks: forever prints the task's label and the CPU
 // it is running on (under the spin lock), then burns time in a delay loop.
 // arg is the task name string that main passes to kcontext().
 void func(void *arg) {
  const char *label = (const char *)arg;
  for (;;) {
    lock();
    printf("Thread-%s on CPU #%d\n", label, cpu_current());
    unlock();
    // Crude delay; the counter is volatile so the loop is not optimized away.
    int volatile spin = 0;
    while (spin < 100000) {
      spin++;
    }
  }
 }
 // The five tasks, named "A" through "E", all running func. Their `next` and
 // `context` fields are not set here; main fills them in.
 Task tasks[] = {
  { .name = "A", .entry = func },
  { .name = "B", .entry = func },
  { .name = "C", .entry = func },
  { .name = "D", .entry = func },
  { .name = "E", .entry = func },
 };
 // ------------------
 // Handler registered through cte_init() in main; picks the next task for the
 // calling CPU and returns the context to resume.
 //   ev  - not used
 //   ctx - context of whatever was running when the handler was entered
 // If this CPU already has a task, ctx is saved into it. On the first call on
 // a CPU there is no task yet, so ctx is not stored and the scan starts from
 // tasks[0]. The scan follows `next` until it reaches a task whose index
 // modulo cpu_count() equals this CPU's number.
 // NOTE(review): a CPU whose number matches no task index (cpu_count() larger
 // than the number of tasks) would never leave the loop - confirm this
 // configuration cannot occur.
 Context *on_interrupt(Event ev, Context *ctx) {
  extern Task tasks[];
  const int cpu = cpu_current();
  Task *t = currents[cpu];
  if (t) {
    t->context = ctx;
  } else {
    t = &tasks[0];
  }
  do {
    t = t->next;
  } while ((t - tasks) % cpu_count() != cpu);
  currents[cpu] = t;
  return t->context;
 }
 // Entry function handed to mpe_init() in main. Calls iset(true) and then
 // yield(); presumably this enables interrupts and traps into on_interrupt()
 // so the CPU picks up its first task - confirm against the AM API.
 void mp_entry() {
  iset(true);
  yield();
 }
 // Initializes I/O and the trap handler, prepares every task (a kcontext()
 // created on the task's own stack area, plus a circular `next` link), and
 // then calls mpe_init() with mp_entry.
 int main() {
  ioe_init();
  cte_init(on_interrupt);
  Task *const end = tasks + LENGTH(tasks);
  for (Task *t = tasks; t != end; t++) {
    // Stack area: everything in the union after the `context` field.
    Area stack = (Area) { &t->context + 1, t + 1 };
    t->context = kcontext(stack, t->entry, (void *)t->name);
    // The last task links back to the first one.
    t->next = (t + 1 == end) ? tasks : t + 1;
  }
  mpe_init(mp_entry);
 }