Caching 不同级别缓存中的处理延迟

Caching 不同级别缓存中的处理延迟,caching,latency,Caching,Latency,关于对缓存级别(L1、L2、L3)和RAM的访问时间, 我遇到了一种奇怪的行为,我还没有找到答案, 如果您能帮助我,我们将不胜感激:) 我开始以以下方式填充内存块, 我有不同的块大小作为输入,例如16字节,32字节。。。。256KB, 对于我读取内存的每个特定块,计数并写回。例如,对于1KB,我有256个不同的计数器数组(因为我的计数器是int32,32位=4字节), 我从256个不同的计数器数组的零开始(称之为计数器数组)计数并写回,我做了10000次计数(0~10000),做了10000次,

关于对缓存级别(L1、L2、L3)和RAM的访问时间, 我遇到了一种奇怪的行为,我还没有找到答案, 如果您能帮助我,我们将不胜感激:)

我按以下方式填充内存块:以不同的块大小作为输入,例如16字节、32字节……256KB;对于每个特定大小的块,我读取内存、计数并写回。例如,对于1KB的块,我有一个包含256个计数器的数组(因为我的计数器是int32,32位=4字节)。我让这256个计数器(称之为计数器数组)都从零开始计数并写回,每次测量计数10000次(0~10000);这样的测量重复100次,记录这100次结果,取平均值作为处理时间(时间按以下代码计算)。

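COUNTERS_MAX = 10000;
ITERATION_MAX = 100;
//现在,每个核心应该执行的功能是计数器(cnt = cnt + 1)
static int
lcore_recv(struct lcore_params *p)
{
    unsigned lcore_id = rte_lcore_id();
    printf("Starting core %u\n", lcore_id);
#ifndef EXCEL_OUTPUT
#ifndef DIRECT_FILE_WRITE
    struct tableEntry outputTable[ITERATION_MAX];
#endif
#endif
    while(canContinue_)
    {
        //printf("Starting core %u\n", lcore_id);
        //int index=((lcore_id-p->baseIndex)-1+CORE_MAX)%CORE_MAX;
        void * vp;
        struct data * d = p->valueMem;
        FILE* fp = p->fp;
        //fprintf(fp, "Iteration %d ----------------------\n", p->iteration);
        //int index = p->index;
        struct timespec t1, t2;
        for(int q = 0; q < ITERATION_MAX; q++)
        {
            double processTime = 0;
            clock_gettime(1, &t1);
            for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
            {
                for (int i = 0; i < d->count; i++)
                {
                    d->value[i]++;
                }
            }
            clock_gettime(1, &t2);
            processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* 纳秒 */
            //检查每个计数器的最后一个值
            int expectedVal = (q + 1) * COUNTERS_MAX;
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
            fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
            bool allOk = true;
            for (int i = 0; i < d->count; i++)
            {
                if(d->value[i]!=expectedVal)
                {
                    if(allOk)
                    {
                        allOk = false;
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
                        fprintf(fp," Failed : ");
#endif
#endif
                    }
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
                    fprintf(fp,"%d ", i);
#endif
#endif
                }
            }
#ifdef EXCEL_OUTPUT
            struct tableEntry* entry= &outputTable[p->index][p->iteration][q];
            entry->allOk=allOk;
            entry->expectedVal=expectedVal;
            entry->processTime=processTime;
#else
#ifdef DIRECT_FILE_WRITE
            if(allOk)
            {
                fprintf(fp,"All counters are ok \n");
            }
            else
            {
                fprintf(fp,"\n");
            }
            fprintf(fp, "*** Time = %f ns \n", processTime);
#else
            struct tableEntry* entry= &outputTable[q];
            entry->allOk=allOk;
            entry->expectedVal=expectedVal;
            entry->processTime=processTime;
#endif
#endif
        }
#ifndef EXCEL_OUTPUT
#ifndef DIRECT_FILE_WRITE
        for(int q = 0; q < ITERATION_MAX; q++)
        {
            struct  tableEntry* entry= &outputTable[q];
            fprintf(fp," Expected : %d\n", entry->expectedVal);
            if(entry->allOk)
            {
                fprintf(fp,"All counters are ok \n");
            }
            else
            {
                fprintf(fp,"Failed \n");
            }
            fprintf(fp, "*** Time = %f ns \n", entry->processTime);
        }
#endif
#endif
        pthread_mutex_lock(&mutexLock_);
        processedCount++;
        pthread_cond_signal(&readWaitHandle);
        pthread_cond_wait(&newIterWaitHandle, &mutexLock_);
        pthread_mutex_unlock(&mutexLock_);
    }
    return 0;
}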
所以对于每个块大小,我都做了相同的测试。例如,如果我有20个不同的测试点(不同的内存块大小,如16B、32B等),我就会得到一个100行20列的时间矩阵,单位为“ns”。 因此,每列对应一种块大小,每行对应100次测试中的一次。 最后我对每一列取平均值,得到每种块大小的处理时间,奇怪的行为如下所示: 每当我从16B这样的小块开始测试时,在大约50字节到600字节的区间内,我总是看到这种异常的行为,我不知道为什么?(我的第一个问题) 另外,如果继续增大到超过2.93MB(大约是8MB(LLC大小)/3(同时运行的内核数)),就会出现如下所示的跳变。 我的第二个问题是,这种跳变是否合理?我的意思是,LLC延迟和RAM延迟之间相差2.5到3倍左右是否正常,还是应该更大?

另外,我的系统是核心i7,3.4 Ghz,L1:32 KB,L2:256 KB和L3:8 MB,带有16 GB RAM


提前感谢您的帮助和注意事项

您的测试方法并不是在测量缓存延迟(而且您打开了TurboBoost,因此CPU频率不是恒定的)

缓存的延迟是已知的,并以cpu周期而不是以ns(缓存以cpu核心频率运行)来度量;内存延迟的单位是cycles+ns,因为数据从内存读取后必须通过缓存层次结构(cycles)(ns,内存有自己的时钟)

例如i7-4xxx(Haswell):

英特尔哈斯韦尔

英特尔i7-4770(哈斯韦尔),3.4千兆赫(涡轮增压关闭),22纳米。内存:32 GB(PC3-12800 cl11 cr2)

  • 一级数据缓存延迟=4个周期,用于通过指针进行简单访问
  • 一级数据缓存延迟
        COUNTERS_MAX = 10000;
        ITERATION_MAX = 100;
        // The Function which each core should do, now is counter (cnt = cnt + 1)
    // Worker entry point executed on each lcore.
    //
    // For every memory block prepared by the controlling thread it performs
    // ITERATION_MAX timed measurements. One measurement increments every counter
    // of the block COUNTERS_MAX times; afterwards each counter is compared with
    // the expected value and the elapsed time is recorded (in the global table,
    // directly in the file, or in a local table, depending on EXCEL_OUTPUT /
    // DIRECT_FILE_WRITE). The worker then increments processedCount, signals
    // readWaitHandle and sleeps on newIterWaitHandle until the next block is ready.
    //
    // p: per-core parameters (block descriptor, block index, output file, core index).
    // Returns 0 once canContinue_ has been cleared.
    static int
    lcore_recv(struct lcore_params *p)
    {
        unsigned lcore_id = rte_lcore_id();
        printf("Starting core %u\n", lcore_id);

    #ifndef EXCEL_OUTPUT
    #ifndef DIRECT_FILE_WRITE
        struct tableEntry outputTable[ITERATION_MAX];
    #endif
    #endif

        while(canContinue_)
        {
            struct data * d = p->valueMem;
            FILE* fp = p->fp;
            (void)fp;   // not referenced when EXCEL_OUTPUT is defined

            struct timespec t1, t2;
            for(int q = 0; q < ITERATION_MAX; q++)
            {
                // The timed region is deliberately left exactly as it was so the
                // measured instruction stream does not change; only the inner loop
                // variable was renamed because it used to shadow the parameter 'p'.
                clock_gettime(1, &t1);   // clock id 1 is CLOCK_MONOTONIC on Linux
                for(uint32_t pass = 0; pass <= COUNTERS_MAX - 1; pass++)
                {
                    for (int i = 0; i < d->count; i++)
                    {
                        d->value[i]++;
                    }
                }
                clock_gettime(1, &t2);

                // Subtract the integer fields before converting to double:
                // tv_sec * 1e9 on its own stops being exactly representable once it
                // exceeds 2^53, which would cost nanosecond resolution.
                double processTime = (double)(t2.tv_sec - t1.tv_sec) * 1e9
                                   + (double)(t2.tv_nsec - t1.tv_nsec); /* nanoseconds */

                // Every counter starts at zero for a new block, so after q + 1
                // measurements each one must hold (q + 1) * COUNTERS_MAX.
                int expectedVal = (q + 1) * COUNTERS_MAX;

    #ifndef EXCEL_OUTPUT
    #ifdef DIRECT_FILE_WRITE
                fprintf(fp," Expected : %d\n", expectedVal);
    #endif
    #endif
                bool allOk = true;
                for (int i = 0; i < d->count; i++)
                {
                    if(d->value[i]!=expectedVal)
                    {
                        if(allOk)
                        {
                            allOk = false;
    #ifndef EXCEL_OUTPUT
    #ifdef DIRECT_FILE_WRITE
                            fprintf(fp," Failed : ");
    #endif
    #endif
                        }

    #ifndef EXCEL_OUTPUT
    #ifdef DIRECT_FILE_WRITE
                        // list the index of every counter that is wrong
                        fprintf(fp,"%d ", i);
    #endif
    #endif
                    }
                }

    #ifdef EXCEL_OUTPUT
                struct tableEntry* entry= &outputTable[p->index][p->iteration][q];

                entry->allOk=allOk;
                entry->expectedVal=expectedVal;
                entry->processTime=processTime;
    #else
    #ifdef DIRECT_FILE_WRITE
                if(allOk)
                {
                    fprintf(fp,"All counters are ok \n");
                }
                else
                {
                    fprintf(fp,"\n");
                }
                fprintf(fp, "*** Time = %f ns \n", processTime);
    #else
                struct tableEntry* entry= &outputTable[q];

                entry->allOk=allOk;
                entry->expectedVal=expectedVal;
                entry->processTime=processTime;
    #endif
    #endif
            }

    #ifndef EXCEL_OUTPUT
    #ifndef DIRECT_FILE_WRITE

            // Deferred output: dump the locally collected results after all
            // measurements of this block are done.
            for(int q = 0; q < ITERATION_MAX; q++)
            {
                struct  tableEntry* entry= &outputTable[q];
                fprintf(fp," Expected : %d\n", entry->expectedVal);
                if(entry->allOk)
                {
                    fprintf(fp,"All counters are ok \n");
                }
                else
                {
                    fprintf(fp,"Failed \n");
                }
                fprintf(fp, "*** Time = %f ns \n", entry->processTime);

            }
    #endif
    #endif
            // Report completion and sleep until the next block is announced.
            // NOTE(review): the wait has no predicate, so a spurious wakeup would
            // start the next round early; fixing that needs a shared round counter
            // maintained by the signalling thread.
            pthread_mutex_lock(&mutexLock_);
            processedCount++;
            pthread_cond_signal(&readWaitHandle);
            pthread_cond_wait(&newIterWaitHandle, &mutexLock_);
            pthread_mutex_unlock(&mutexLock_);
        }
        return 0;
    }
    
           for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
            {
                for (int i = 0; i < d->count; i++)
                {
                    d->value[i]++;
                }
            }
    
    /* One dependent load: replace p with the pointer stored at *p. */
    #define ONE p = (char **)*p;
    /* The remaining macros only repeat ONE: 5, 10, 50 and 100 times. */
    #define FIVE    ONE ONE ONE ONE ONE
    #define TEN FIVE FIVE
    #define FIFTY   TEN TEN TEN TEN TEN
    #define HUNDRED FIFTY FIFTY
    
    /*
     * Pointer-chasing load loop.
     *
     * iterations: number of times the whole chase is repeated.
     * cookie:     opaque pointer, cast to struct mem_state; the code reads its
     *             p[0], len and line members.
     *
     * Each inner-loop step expands to 100 back-to-back "p = *p" statements, so
     * every load depends on the result of the previous one.
     */
    void
    benchmark_loads(iter_t iterations, void *cookie)
    {
        struct mem_state* state = (struct mem_state*)cookie;
        register char **p = (char**)state->p[0];
        register size_t i;
        /* number of HUNDRED groups executed per iteration */
        register size_t count = state->len / (state->line * 100) + 1;
    
        while (iterations-- > 0) {
            for (i = 0; i < count; ++i) {
                HUNDRED;
            }
        }
    
        /* presumably keeps the chase from being optimized away - TODO confirm
           against the definition of use_pointer */
        use_pointer((void *)p);
        /* store the final position back so a later call continues from it */
        state->p[0] = (char*)p;
    }
    
     p = (char**) *p;  // (in intel syntax) == mov eax, [eax]
     p = (char**) *p;
     p = (char**) *p;
     ....   // 100 times total
     p = (char**) *p;
    
     p = (char **)*p;
    
    ifeq ($(RTE_SDK),)
    $(error "Please define RTE_SDK environment variable")
    endif
    
    # Default target, can be overridden by command line or environment
    RTE_TARGET ?= x86_64-native-linuxapp-gcc
    
    include $(RTE_SDK)/mk/rte.vars.mk
    
    # binary name
    APP = Mahdi_test
    
    INC += $(wildcard include/*.h)
    
    # all source are stored in SRCS-y
    SRCS-y := main.c
    
    # Header search path: the project's own include/ directory plus /usr/local/include.
    # (The previous "-I -S$(SRCDIR)/include" made the compiler treat
    # "-S$(SRCDIR)/include" as the directory name, so include/ was never searched.)
    CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)/include -I/usr/local/include
    
    # Most optimizations are only enabled if an -O level is set on the command line,
    # otherwise they are disabled, even if individual optimization flags are specified.
    # With -O, the compiler tries to reduce code size and execution time, 
    # without performing any optimizations that take a great deal of compilation time.
    # -O3 Optimize yet more. -O3 turns on all optimizations specified by -O2
    
    # To emit assembly instead of object code, use this line instead of the one below:
    # EXTRA_CFLAGS += -O3 -S -Wno-error -std=c99
    # After following line do make, go to ./build and run : objdump -d -M intel -S main.o >a.txt
    EXTRA_CFLAGS += -O3 -g -Wno-error -std=c99
    
    # rte.extapp.mk : External application
    include $(RTE_SDK)/mk/rte.extapp.mk
    
    #if __STDC_VERSION__ >= 199901L
    #define _XOPEN_SOURCE 600
    #else
    #define _XOPEN_SOURCE 500
    #endif /* __STDC_VERSION__ */
    
    #include <math.h>
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>
    #include <unistd.h>
    
    #include <sys/stat.h>
    #include <sys/types.h>
    
    #include <rte_memory.h>
    #include <rte_malloc.h>
    #include <rte_ring.h>
    
    #define EXCEL_OUTPUT
    
    #ifndef EXCEL_OUTPUT
    #define DIRECT_FILE_WRITE
    #endif
    
    // Number of worker cores; one lcore_params entry and one results sheet per core.
    #define CORE_MAX        3
    // Number of block sizes that are measured. Block k (1-based) is
    // Factor^k * 2^BKMG bytes, so with BKMG = 4 the largest of 20 blocks is about
    // 52 KB; the "~ 168.72 MB" figure below corresponds to 40 blocks.
    #define BLOCK_MAX       20     // BKMG = 4, ~ 168.72 MB
    // Passes over the whole block inside one timed measurement.
    #define COUNTERS_MAX    10000
    // Timed measurements taken per block size.
    #define ITERATION_MAX   100
    // Growth ratio between consecutive block sizes.
    #define Factor          1.5
    
    // Base-2 exponent of the base block size in bytes (2^4 = 16 bytes).
    #define BKMG            4
    
    // Tag that is embedded in the names of the generated result files/directories.
    char* testNumber = "23";
    
    
    /*
    uint32_t sizes[BLOCK_MAX] = {   
                                    1*Factor*pow(2, 10)/4, 2*Factor*pow(2, 10)/4, 4*Factor*pow(2, 10)/4, 8*Factor*pow(2, 10)/4, 16*Factor*pow(2, 10)/4, 32*Factor*pow(2, 10)/4, 64*Factor*pow(2, 10)/4, 128*Factor*pow(2, 10)/4, 256*Factor*pow(2, 10)/4, 512*Factor*pow(2, 10)/4,
                                    1*Factor*pow(2, 20)/4, 2*Factor*pow(2, 20)/4, 4*Factor*pow(2, 20)/4, 8*Factor*pow(2, 20)/4, 16*Factor*pow(2, 20)/4, 32*Factor*pow(2, 20)/4, 64*Factor*pow(2, 20)/4, 128*Factor*pow(2, 20)/4, 256*Factor*pow(2, 20)/4, 512*Factor*pow(2, 20)/4,
                                    1*Factor*pow(2, 30)/4, 2*Factor*pow(2, 30)/4
                                };
    */
    // Block sizes expressed as a number of 32-bit counters:
    // sizes[k] = Factor^(k+1) * 2^BKMG / 4, truncated to an integer.
    //
    // The array length is taken from the initializer list (50 entries). It used
    // to be declared as [BLOCK_MAX] while still listing 50 values, i.e. 30 excess
    // initializers for BLOCK_MAX = 20. Keep BLOCK_MAX <= 50.
    //
    // NOTE: pow() in a file-scope initializer is not a constant expression in
    // ISO C; this relies on the compiler folding the calls at compile time.
    uint32_t sizes[] = {   
                                    pow(Factor, 1)*pow(2, BKMG)/4, pow(Factor, 2)*pow(2, BKMG)/4, pow(Factor, 3)*pow(2, BKMG)/4, pow(Factor, 4)*pow(2, BKMG)/4, pow(Factor, 5)*pow(2, BKMG)/4, pow(Factor, 6)*pow(2, BKMG)/4, pow(Factor, 7)*pow(2, BKMG)/4, pow(Factor, 8)*pow(2, BKMG)/4, pow(Factor, 9)*pow(2, BKMG)/4, pow(Factor,10)*pow(2, BKMG)/4,
                                    pow(Factor,11)*pow(2, BKMG)/4, pow(Factor,12)*pow(2, BKMG)/4, pow(Factor,13)*pow(2, BKMG)/4, pow(Factor,14)*pow(2, BKMG)/4, pow(Factor,15)*pow(2, BKMG)/4, pow(Factor,16)*pow(2, BKMG)/4, pow(Factor,17)*pow(2, BKMG)/4, pow(Factor,18)*pow(2, BKMG)/4, pow(Factor,19)*pow(2, BKMG)/4, pow(Factor,20)*pow(2, BKMG)/4,
                                    pow(Factor,21)*pow(2, BKMG)/4, pow(Factor,22)*pow(2, BKMG)/4, pow(Factor,23)*pow(2, BKMG)/4, pow(Factor,24)*pow(2, BKMG)/4, pow(Factor,25)*pow(2, BKMG)/4, pow(Factor,26)*pow(2, BKMG)/4, pow(Factor,27)*pow(2, BKMG)/4, pow(Factor,28)*pow(2, BKMG)/4, pow(Factor,29)*pow(2, BKMG)/4, pow(Factor,30)*pow(2, BKMG)/4,
                                    pow(Factor,31)*pow(2, BKMG)/4, pow(Factor,32)*pow(2, BKMG)/4, pow(Factor,33)*pow(2, BKMG)/4, pow(Factor,34)*pow(2, BKMG)/4, pow(Factor,35)*pow(2, BKMG)/4, pow(Factor,36)*pow(2, BKMG)/4, pow(Factor,37)*pow(2, BKMG)/4, pow(Factor,38)*pow(2, BKMG)/4, pow(Factor,39)*pow(2, BKMG)/4, pow(Factor,40)*pow(2, BKMG)/4,
                                    pow(Factor,41)*pow(2, BKMG)/4, pow(Factor,42)*pow(2, BKMG)/4, pow(Factor,43)*pow(2, BKMG)/4, pow(Factor,44)*pow(2, BKMG)/4, pow(Factor,45)*pow(2, BKMG)/4, pow(Factor,46)*pow(2, BKMG)/4, pow(Factor,47)*pow(2, BKMG)/4, pow(Factor,48)*pow(2, BKMG)/4, pow(Factor,49)*pow(2, BKMG)/4, pow(Factor,50)*pow(2, BKMG)/4,
                                };
    
    /*
    char* names[BLOCK_MAX] = {  
                                "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K",
                                "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M",
                                "1G", "2G"
                             };
    */
    
    // Two-digit labels of the block sizes ("01" .. "50"), used in result file names.
    // The array length is taken from the initializer list; it used to be declared
    // as [BLOCK_MAX] while listing 50 strings, i.e. 30 excess initializers for
    // BLOCK_MAX = 20. Keep BLOCK_MAX <= 50.
    char* names[] = {  
                                "01", "02", "03", "04", "05", "06", "07", "08", "09", "10",
                                "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
                                "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
                                "31", "32", "33", "34", "35", "36", "37", "38", "39", "40",
                                "41", "42", "43", "44", "45", "46", "47", "48", "49", "50",
                             };
    
    // Input parameters of one worker core (with 3 cores there are 3 instances of this struct).
    struct lcore_params 
    {
        struct data* valueMem;  // Descriptor of the memory block assigned to this core (address and size)
        int iteration;          // Index of the memory block that is currently being processed
        FILE* fp;               // Open result file of this core, used for the text output modes
        int index;              // Index of the core; selects the core's slice of the global result table
    };
    
    // Describes the memory block allocated for one core.
    struct data 
    {
        uint32_t* value;    // Start of the counter array; allocated separately for each core
        uint32_t count;     // Number of 32-bit counters in the array
    };
    
    // Result of one timed measurement.
    struct tableEntry
    {
        int expectedVal;        // Value every counter should hold after the measurement
        double processTime;     // Elapsed time of the measurement in nanoseconds
        bool allOk;             // true if every counter matched expectedVal
    };
    
    // Mutex used together with both condition variables below; processedCount is
    // modified while it is held.
    pthread_mutex_t mutexLock_;
    
    // readWaitHandle:    signalled by a worker after it has incremented processedCount;
    //                    the master waits on it until all workers have finished a block.
    // newIterWaitHandle: workers wait on it after finishing a block; the master signals
    //                    it once the next memory block has been prepared.
    pthread_cond_t readWaitHandle, newIterWaitHandle;
    
    // Cleared by the master after the last block; workers leave their loop when it is false.
    bool canContinue_ = true;
    // Number of workers that have finished the current block.
    int processedCount = 0;
    
    #ifdef EXCEL_OUTPUT
    // Holds all results, indexed as [core][block][measurement]; written out at the end of the run.
    struct tableEntry outputTable[CORE_MAX][BLOCK_MAX][ITERATION_MAX];
    
    #endif
    
    // Worker entry point executed on each lcore.
    //
    // For every memory block prepared by main() it performs ITERATION_MAX timed
    // measurements. One measurement increments every counter of the block
    // COUNTERS_MAX times; afterwards each counter is compared with the expected
    // value and the elapsed time is recorded (in the global table, directly in
    // the file, or in a local table, depending on EXCEL_OUTPUT /
    // DIRECT_FILE_WRITE). The worker then increments processedCount, signals
    // readWaitHandle and sleeps on newIterWaitHandle until the next block is ready.
    //
    // p: per-core parameters (block descriptor, block index, output file, core index).
    // Returns 0 once canContinue_ has been cleared.
    static int
    lcore_recv(struct lcore_params *p)
    {
        unsigned lcore_id = rte_lcore_id();
        printf("Starting core %u\n", lcore_id);

    #ifndef EXCEL_OUTPUT
    #ifndef DIRECT_FILE_WRITE
        struct tableEntry outputTable[ITERATION_MAX];
    #endif
    #endif

        while(canContinue_)
        {
            struct data * d = p->valueMem;
            FILE* fp = p->fp;
            (void)fp;   // not referenced when EXCEL_OUTPUT is defined

            struct timespec t1, t2;
            for(int q = 0; q < ITERATION_MAX; q++)
            {
                // TEST TEST ON
                // The timed region is deliberately left exactly as it was so the
                // measured instruction stream does not change; only the inner loop
                // variable was renamed because it used to shadow the parameter 'p'.
                clock_gettime(1, &t1);   // clock id 1 is CLOCK_MONOTONIC on Linux
                for(uint32_t pass = 0; pass <= COUNTERS_MAX - 1; pass++)
                {
                    for (int i = 0; i < d->count; i++)
                    {
                        d->value[i]++;
                    }
                }
                clock_gettime(1, &t2);
                // TEST TEST OFF

                // Subtract the integer fields before converting to double:
                // tv_sec * 1e9 on its own stops being exactly representable once it
                // exceeds 2^53, which would cost nanosecond resolution.
                double processTime = (double)(t2.tv_sec - t1.tv_sec) * 1e9
                                   + (double)(t2.tv_nsec - t1.tv_nsec); /* nanoseconds */

                // Every counter starts at zero for a new block, so after q + 1
                // measurements each one must hold (q + 1) * COUNTERS_MAX.
                int expectedVal = (q + 1) * COUNTERS_MAX;

    #ifndef EXCEL_OUTPUT
    #ifdef DIRECT_FILE_WRITE
                fprintf(fp," Expected : %d\n", expectedVal);
    #endif
    #endif
                bool allOk = true;
                for (int i = 0; i < d->count; i++)
                {
                    if(d->value[i]!=expectedVal)
                    {
                        if(allOk)
                        {
                            allOk = false;
    #ifndef EXCEL_OUTPUT
    #ifdef DIRECT_FILE_WRITE
                            fprintf(fp," Failed : ");
    #endif
    #endif
                        }

    #ifndef EXCEL_OUTPUT
    #ifdef DIRECT_FILE_WRITE
                        // list the index of every counter that is wrong
                        fprintf(fp,"%d ", i);
    #endif
    #endif
                    }
                }

    #ifdef EXCEL_OUTPUT
                struct tableEntry* entry= &outputTable[p->index][p->iteration][q];

                entry->allOk=allOk;
                entry->expectedVal=expectedVal;
                entry->processTime=processTime;
    #else
    #ifdef DIRECT_FILE_WRITE
                if(allOk)
                {
                    fprintf(fp,"All counters are ok \n");
                }
                else
                {
                    fprintf(fp,"\n");
                }
                fprintf(fp, "*** Time = %f ns \n", processTime);
    #else
                struct tableEntry* entry= &outputTable[q];

                entry->allOk=allOk;
                entry->expectedVal=expectedVal;
                entry->processTime=processTime;
    #endif
    #endif
            }

    #ifndef EXCEL_OUTPUT
    #ifndef DIRECT_FILE_WRITE

            // Deferred output: dump the locally collected results after all
            // measurements of this block are done.
            for(int q = 0; q < ITERATION_MAX; q++)
            {
                struct  tableEntry* entry= &outputTable[q];
                fprintf(fp," Expected : %d\n", entry->expectedVal);
                if(entry->allOk)
                {
                    fprintf(fp,"All counters are ok \n");
                }
                else
                {
                    fprintf(fp,"Failed \n");
                }
                fprintf(fp, "*** Time = %f ns \n", entry->processTime);

            }
    #endif
    #endif
            // Report completion and sleep until main() announces the next block.
            // NOTE(review): the wait has no predicate, so a spurious wakeup would
            // start the next round early; fixing that needs a shared round counter
            // maintained by main().
            pthread_mutex_lock(&mutexLock_);
            processedCount++;
            pthread_cond_signal(&readWaitHandle);
            pthread_cond_wait(&newIterWaitHandle, &mutexLock_);
            pthread_mutex_unlock(&mutexLock_);
        }
        return 0;
    }
    
    // Releases the block currently attached to valueMem (if any) and attaches a
    // new zero-filled block of newSize 32-bit counters. Called once per core for
    // every block size.
    //
    // valueMem:  descriptor to update; its value member must be NULL or a pointer
    //            previously obtained from rte_zmalloc.
    // newSize:   number of uint32_t counters to allocate.
    // iteration: unused; kept so existing callers do not have to change.
    //
    // On allocation failure "Memory Fail" is printed and the descriptor is left
    // empty (value == NULL, count == 0) so that loops over count do not touch a
    // NULL array.
    static void 
    mem_alloc(struct data* valueMem, uint32_t newSize, uint32_t iteration)
    {
        (void)iteration;

        if(valueMem->value)
        {
            rte_free(valueMem->value);
        }
        valueMem->value = NULL;
        valueMem->count = 0;

        valueMem->value = (uint32_t *)rte_zmalloc(NULL, sizeof(uint32_t) * newSize, 0);
        if(!valueMem->value)
        {
            printf("Memory Fail\n");
            return;
        }
        // Publish the size only once the array really exists.
        valueMem->count = newSize;
    }
    
    #ifdef EXCEL_OUTPUT
    // Writes all collected results to "output<testNumber>.xml" as an Excel XML
    // workbook with one worksheet per core. Each sheet has a header row with the
    // block sizes in bytes, ITERATION_MAX rows of measured times (one column per
    // block size) and three formula rows: Mean, Standard Deviation and Add Latency.
    // If the file cannot be created an error is reported and nothing is written.
    void saveToExcelFile()
    {
        char name[64];
        snprintf(name, sizeof name, "output%s.xml", testNumber);
    
        FILE* fp = fopen(name, "w");
        if(!fp)
        {
            perror(name);
            return;
        }
        
        // some setting of excel and xml file
        fprintf(fp,"<?xml version=\"1.0\"?>\n\
        <?mso-application progid=\"Excel.Sheet\"?>\n\
        <Workbook xmlns=\"urn:schemas-microsoft-com:office:spreadsheet\"\n\
        xmlns:o=\"urn:schemas-microsoft-com:office:office\"\n\
        xmlns:x=\"urn:schemas-microsoft-com:office:excel\"\n\
        xmlns:ss=\"urn:schemas-microsoft-com:office:spreadsheet\"\n\
        xmlns:html=\"http://www.w3.org/TR/REC-html40\">\n\
        <DocumentProperties xmlns=\"urn:schemas-microsoft-com:office:office\">\n\
        <Author>m</Author>\n\
        <LastAuthor>m</LastAuthor>\n\
        <Created>2016-06-11T13:00:49Z</Created>\n\
        <LastSaved>2016-06-11T13:01:30Z</LastSaved>\n\
        <Version>15.00</Version>\n\
        </DocumentProperties>\n\
        <OfficeDocumentSettings xmlns=\"urn:schemas-microsoft-com:office:office\">\n\
        <AllowPNG/>\n\
        </OfficeDocumentSettings>\n\
        <ExcelWorkbook xmlns=\"urn:schemas-microsoft-com:office:excel\">\n\
        <WindowHeight>7755</WindowHeight>\n\
        <WindowWidth>20490</WindowWidth>\n\
        <WindowTopX>0</WindowTopX>\n\
        <WindowTopY>0</WindowTopY>\n\
        <ActiveSheet>0</ActiveSheet>\n\
        <ProtectStructure>False</ProtectStructure>\n\
        <ProtectWindows>False</ProtectWindows>\n\
        </ExcelWorkbook>\n\
        <Styles>\n\
        <Style ss:ID=\"Default\" ss:Name=\"Normal\">\n\
        <Alignment ss:Vertical=\"Bottom\"/>\n\
        <Borders/>\n\
        <Font ss:FontName=\"Calibri\" x:Family=\"Swiss\" ss:Size=\"11\" ss:Color=\"#000000\"/>\n\
        <Interior/>\n\
        <NumberFormat/>\n\
        <Protection/>\n\
        </Style>\n\
        <Style ss:ID=\"s62\">\n\
        <Font ss:FontName=\"Calibri\" x:Family=\"Swiss\" ss:Size=\"11\" ss:Color=\"#FF0000\"\n\
        ss:Bold=\"1\"/>\n\
        </Style>\n\
        </Styles>\n");
    
        for(int i=0; i < CORE_MAX; i++)
        {
            // starts a worksheet: 1 label column + BLOCK_MAX data columns,
            // 1 header row + ITERATION_MAX data rows + 3 formula rows
            fprintf(fp,"<Worksheet ss:Name=\"Sheet%d\">\n\
            <Table ss:ExpandedColumnCount=\"%d\" ss:ExpandedRowCount=\"%d\" x:FullColumns=\"1\"\n\
            x:FullRows=\"1\" ss:DefaultRowHeight=\"15\">\n", i + 1, BLOCK_MAX + 1, ITERATION_MAX + 4);
    
            fprintf(fp, "<Column ss:Width=\"95.25\"/>\n");
            fprintf(fp,"<Row ss:StyleID=\"s62\">\n");
    
            // header row: block size in bytes for every column
            for(int q=0; q < BLOCK_MAX; q++)
            {
                // Large enough for any float printed with 3 decimals; the former
                // 10-byte buffer only just fitted the largest label of 20 blocks.
                char s[64];
                float f = (float)(pow(Factor,q+1)*pow(2.0, BKMG));
                snprintf(s, sizeof s, "%0.3f", f);
                if(q == 0)
                {
                   // ss:Index="2" skips the first column, which is reserved for row labels
                   fprintf(fp,"<Cell ss:Index=\"2\"><Data ss:Type=\"Number\">%s</Data></Cell>\n", s);
                }
                else
                {
                   fprintf(fp,"<Cell><Data ss:Type=\"Number\">%s</Data></Cell>\n", s);
                }
            }
            fprintf(fp,"</Row>\n");

            // data rows: one row per measurement, one column per block size
            for(int j = 0; j < ITERATION_MAX; j++)
            {
                fprintf(fp,"<Row>\n");
                for(int q = 0; q < BLOCK_MAX; q++)
                {
                    if(q == 0)
                    {
                        fprintf(fp,"<Cell ss:Index=\"2\"><Data ss:Type=\"Number\">%f</Data></Cell>\n", outputTable[i][q][j].processTime);
                    }
                    else
                    {
                        fprintf(fp,"<Cell><Data ss:Type=\"Number\">%f</Data></Cell>\n", outputTable[i][q][j].processTime);
                    }
                }
                fprintf(fp,"</Row>\n");
            }
            
            // Mean row: sits directly below the data, which is therefore R[-ITERATION_MAX]..R[-1]
            fprintf(fp,"<Row>\n");
            fprintf(fp,"<Cell ss:StyleID=\"s62\"><Data ss:Type=\"String\">Mean</Data></Cell>\n");
            for(int q = 0; q < BLOCK_MAX; q++)
            {      
                fprintf(fp," <Cell ss:Formula=\"=AVERAGE(R[%d]C:R[-1]C)\"><Data ss:Type=\"Number\">0</Data></Cell>\n", -ITERATION_MAX);
            }
            fprintf(fp,"</Row>\n");
        
        
            // Standard Deviation row: one row below Mean, so the data is
            // R[-(ITERATION_MAX+1)]..R[-2]. The range used to end at R[-1], which
            // wrongly fed the Mean cell into STDEV as an extra sample.
            fprintf(fp,"<Row>\n");
            fprintf(fp,"<Cell ss:StyleID=\"s62\"><Data ss:Type=\"String\">Standard Deviation</Data></Cell>\n");
            for(int q=0; q<BLOCK_MAX; q++)
            {      
                fprintf(fp," <Cell ss:Formula=\"=STDEV(R[%d]C:R[-2]C)\"><Data ss:Type=\"Number\">0</Data></Cell>\n", -(ITERATION_MAX + 1));
            }
            fprintf(fp,"</Row>\n");

            // Add Latency row: Mean (R[-2]) divided by the number of counters in the
            // block (2^BKMG/4 * Factor^(q+1)) and by the number of passes.
            fprintf(fp,"<Row>\n");
            fprintf(fp,"<Cell ss:StyleID=\"s62\"><Data ss:Type=\"String\">Add Latency</Data></Cell>\n");
            for(int q=0; q<BLOCK_MAX; q++)
            {      
                fprintf(fp," <Cell ss:Formula=\"=R[-2]C/(2^%d/4)/%d/%f^%d\"><Data ss:Type=\"Number\">0</Data></Cell>\n", BKMG, COUNTERS_MAX, Factor, q + 1);
            }
            fprintf(fp,"</Row>\n");
    
            //end of worksheet
            fprintf(fp,"</Table>\n</Worksheet>\n");
        }
        //end of file
        fprintf(fp,"</Workbook>");
        fclose(fp);
    }
    #endif
    
    int
    main(int argc, char **argv)
    {
        mkdir("./Resaults", 0777);
    
        int ret;
        unsigned lcore_id;
    
        pthread_attr_t attr;
        pthread_mutex_init(&mutexLock_, NULL);
        pthread_cond_init(&newIterWaitHandle, NULL);
        pthread_cond_init(&readWaitHandle, NULL);
    
        ret = rte_eal_init(argc, argv);
    
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "Cannot init EAL\n");
        struct lcore_params params[CORE_MAX];
        char numT[5];
        sprintf(numT, "%d", CORE_MAX);
    
        for(int i = 0; i < CORE_MAX; i++)
        {
    
    
            // Generates some structures to hold information of assinged job of each core
            struct data* commonMem = (struct data*)rte_malloc(NULL, sizeof(struct data), 0);
    
    
    #ifndef EXCEL_OUTPUT
            char num[5];
            sprintf(num, "%d", i);
    
            char name3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
            strcat(name3, "./Resaults/");
            strcat(name3, testNumber);
    
            mkdir(name3, 0777);
    
            strcat(name3, "/R");
            strcat(name3, num);
            strcat(name3, "_");
            strcat(name3, numT);
            strcat(name3, "Core");
    
            mkdir(name3, 0777);
    
            char name2[] = {'/','R', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
            strcat(name2, num);
            strcat(name2, "_");
            strcat(name2, names[0]);
            strcat(name2, ".txt");
    
            strcat(name3, name2);
            params[i].fp = fopen(name3, "w");
    #endif
    
            mem_alloc(commonMem, sizes[0], 0);
            params[i].valueMem = commonMem;
            params[i].index = i;
            params[i].iteration = 0;
            commonMem->value[i] = NULL;
        }
        /*
        printf("sleep ...\n");
        for(int f=0;f<4; f++)
        {
          sleep(1);
        }
        */
        /*
        double p=0;
        for(double f=0;f<1e9; f+=0.3)
        {
          p+=0.1;
        }*/
        
        printf("Starting lcores ...\n");
        printf("RTE_MAX_LCORE = %d\n", RTE_MAX_LCORE);
    
        lcore_id = rte_get_next_lcore(-1, 1, 0);
    
        processedCount = 0;
    
        // Ask each core do the funtion lcore_recv
        for(int i = 0; i < CORE_MAX; i++)
        {
            rte_eal_remote_launch((lcore_function_t*)lcore_recv, &params[i], lcore_id);
            lcore_id = rte_get_next_lcore(lcore_id, 0, 1);
        }
        // For each core do the function for "BLOCK_MAX" times
        for(int j = 1; j <= BLOCK_MAX; j++)
        {
            printf("Iteration : %d\n", j);
            pthread_mutex_lock(&mutexLock_);
            while(processedCount < CORE_MAX)
            {
                pthread_cond_wait(&readWaitHandle, &mutexLock_);
            }
    
            for(int i = 0; i < CORE_MAX; i++)
            {
    #ifndef EXCEL_OUTPUT
                fclose(params[i].fp);
                if(j < BLOCK_MAX)
                {
                    char num[5];
                    sprintf(num, "%d", i);
                    char name3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
                    strcat(name3, "./Resaults/");
                    strcat(name3, testNumber);
                    mkdir(name3, 0777);
                    strcat(name3, "/R");
                    strcat(name3, num);
                    strcat(name3, "_");
                    strcat(name3, numT);
                    strcat(name3, "Core");
                    mem_alloc( params[i].valueMem, sizes[j], j);
                    char name2[] = {'/','R', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
                    strcat(name2, num);
                    strcat(name2, "_");
                    strcat(name2, names[j]);
                    strcat(name2, ".txt");
                    strcat(name3, name2);
                    params[i].fp = fopen(name3,"w");
                    params[i].iteration = j;
                }
    #else
                mem_alloc( params[i].valueMem, sizes[j], j);
                params[i].iteration = j;
    #endif
    
            }
    
            if(j < BLOCK_MAX)
            {
                printf("%d : New Data Added ----------\n", j);
            }
            else
            {
                canContinue_ = false;
            }
            //Signal cores in order to start new iteration
            processedCount = 0;
            for(int i = 0; i < CORE_MAX; i++)
            {
                pthread_cond_signal(&newIterWaitHandle);
            }
            pthread_mutex_unlock(&mutexLock_);
        }
    
        printf("Waiting for lcores to finish ...\n");
    
    #ifdef EXCEL_OUTPUT
        saveToExcelFile();
    #endif
        rte_eal_mp_wait_lcore();
        return 0;
    }
    
                           // TEST TEST ON
                clock_gettime(1, &t1);
      47:   48 89 e6                mov    rsi,rsp
      4a:   bf 01 00 00 00          mov    edi,0x1
      4f:   e8 00 00 00 00          call   54 <lcore_recv+0x54>
      54:   8b 4b 08                mov    ecx,DWORD PTR [rbx+0x8]
      57:   be 10 27 00 00          mov    esi,0x2710
      5c:   0f 1f 40 00             nop    DWORD PTR [rax+0x0]
                for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
                {
                    for (int i = 0; i < d->count; i++)
      60:   85 c9                   test   ecx,ecx
      62:   74 1d                   je     81 <lcore_recv+0x81>
      64:   48 8b 03                mov    rax,QWORD PTR [rbx]
      67:   31 d2                   xor    edx,edx
      69:   0f 1f 80 00 00 00 00    nop    DWORD PTR [rax+0x0]
                    {
                        d->value[i]++;
      70:   83 00 01                add    DWORD PTR [rax],0x1
                double processTime = 0;
                // TEST TEST ON
                clock_gettime(1, &t1);
                for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
                {
                    for (int i = 0; i < d->count; i++)
      73:   83 c2 01                add    edx,0x1
      76:   48 83 c0 04             add    rax,0x4
      7a:   8b 4b 08                mov    ecx,DWORD PTR [rbx+0x8]
      7d:   39 ca                   cmp    edx,ecx
      7f:   72 ef                   jb     70 <lcore_recv+0x70>
            for(int q = 0; q < ITERATION_MAX; q++)
            {
                double processTime = 0;
                // TEST TEST ON
                clock_gettime(1, &t1);
                for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
      81:   83 ee 01                sub    esi,0x1
      84:   75 da                   jne    60 <lcore_recv+0x60>
                    for (int i = 0; i < d->count; i++)
                    {
                        d->value[i]++;
                    }
                }
                clock_gettime(1, &t2);
      86:   48 8d 74 24 10          lea    rsi,[rsp+0x10]
      8b:   bf 01 00 00 00          mov    edi,0x1
      90:   e8 00 00 00 00          call   95 <lcore_recv+0x95>
    #ifdef DIRECT_FILE_WRITE
                fprintf(fp," Expected : %d\n", expectedVal);
    #endif
    #endif
                bool allOk = true;
                for (int i = 0; i < d->count; i++)
      95:   8b 4b 08                mov    ecx,DWORD PTR [rbx+0x8]
                clock_gettime(1, &t2);
                processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
                // TEST TEST OFF
    
                //Checks last value of each counter
                int expectedVal = (q + 1) * COUNTERS_MAX;
      98:   41 8d 7c 24 01          lea    edi,[r12+0x1]
                    {
                        d->value[i]++;
                    }
                }
                clock_gettime(1, &t2);
                processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
      9d:   c4 e1 f3 2a 4c 24 10    vcvtsi2sd xmm1,xmm1,QWORD PTR [rsp+0x10]
      a4:   c4 e1 eb 2a 14 24       vcvtsi2sd xmm2,xmm2,QWORD PTR [rsp]
      aa:   c4 e1 fb 2a 44 24 18    vcvtsi2sd xmm0,xmm0,QWORD PTR [rsp+0x18]
    #ifdef DIRECT_FILE_WRITE
                fprintf(fp," Expected : %d\n", expectedVal);
    #endif
    #endif
                bool allOk = true;
                for (int i = 0; i < d->count; i++)
      b1:   85 c9                   test   ecx,ecx
                    {
                        d->value[i]++;
                    }
                }
                clock_gettime(1, &t2);
                processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
      b3:   c5 f3 59 0d 00 00 00    vmulsd xmm1,xmm1,QWORD PTR [rip+0x0]        # bb <lcore_recv+0xbb>
      ba:   00 
      bb:   c5 eb 59 15 00 00 00    vmulsd xmm2,xmm2,QWORD PTR [rip+0x0]        # c3 <lcore_recv+0xc3>
      c2:   00 
      c3:   c5 f3 58 d8             vaddsd xmm3,xmm1,xmm0
      c7:   c4 e1 f3 2a 4c 24 08    vcvtsi2sd xmm1,xmm1,QWORD PTR [rsp+0x8]
      ce:   c5 eb 58 c1             vaddsd xmm0,xmm2,xmm1
      d2:   c5 e3 5c c0             vsubsd xmm0,xmm3,xmm0
    #ifdef DIRECT_FILE_WRITE
                fprintf(fp," Expected : %d\n", expectedVal);
    #endif
    #endif
                bool allOk = true;
                for (int i = 0; i < d->count; i++)
      d6:   74 6a                   je     142 <lcore_recv+0x142>
      d8:   48 8b 33                mov    rsi,QWORD PTR [rbx]
      db:   31 c0                   xor    eax,eax
    #ifndef EXCEL_OUTPUT
    #ifdef DIRECT_FILE_WRITE
                fprintf(fp," Expected : %d\n", expectedVal);
    #endif
    #endif
                bool allOk = true;
      dd:   ba 01 00 00 00          mov    edx,0x1
      e2:   66 0f 1f 44 00 00       nop    WORD PTR [rax+rax*1+0x0]
      e8:   44 39 2c 86             cmp    DWORD PTR [rsi+rax*4],r13d
      ec:   41 0f 45 d6             cmovne edx,r14d
      f0:   48 83 c0 01             add    rax,0x1
                for (int i = 0; i < d->count; i++)
      f4:   39 c1                   cmp    ecx,eax
      f6:   77 f0                   ja     e8 <lcore_recv+0xe8>
    #endif
                    }
                }