Assembly 汇编中的矩阵乘法_Assembly_X86_Masm_Matrix Multiplication

Assembly 汇编中的矩阵乘法

assembly x86

Assembly 装配中的矩阵乘法,assembly,x86,masm,matrix-multiplication,Assembly,X86,Masm,Matrix Multiplication,我正在用汇编语言写一些矩阵乘法的代码。我不能使用变量，只能在堆栈上存储我需要的内容。该算法似乎工作正常，但在最后两段代码中，IMUL和MOV使用寄存器时存在问题。我在这里发布我的代码： unsigned int m = 3; // raws of mat1 unsigned int n = 2; // columns of mat1 unsigned int k = 4; // columns of mat2 short int m

我正在用汇编语言写一些矩阵乘法的代码。我不能使用变量，只能在堆栈上存储我需要的内容。该算法似乎工作正常，但在最后两段代码中，IMUL和MOV使用寄存器时存在问题。我在这里发布我的代码：

        // Question listing: multiplies mat1 (m x n) by mat2 (n x k) into mat3 inside
        // an inline __asm block, then prints mat3 as m rows of k values.
        // As posted it does not assemble: the lines tagged #58, #59 and #66 build
        // their memory operand from two registers plus arithmetic, which the
        // compiler rejects (errors C2404 / C2430 quoted after the listing).
        unsigned int m = 3; // rows of mat1
        unsigned int n = 2; // columns of mat1
        unsigned int k = 4; // columns of mat2
        short int mat1[] = { -1,-2, 4,5, 4,-2 }; // first matrix
        short int mat2[] = { 2,0,4,6, 0,2,-1,3 }; // second matrix
        int mat3[1024]; // output matrix (no initializer)

        __asm {

            XOR EAX, EAX    // mat1 row counter (incremented before use, so 1-based in the index expressions)
            XOR EBX, EBX    // mat2 column counter (also 1-based once incremented)
            XOR EDX, EDX    // mat1 column counter (equal to the mat2 row counter)
            XOR EDI, EDI    // accumulator for the sum of products copied into the output matrix

            Loop1 :         // outer loop: selects the row of the output matrix mat3
            XOR EBX, EBX    // a new row starts: reset the column counter
                CMP m, EAX  // have all m rows of mat1 been processed?
                JZ Finale   // yes: the algorithm is over
                INC EAX     // advance the mat1 row counter
                JMP Loop2

            Loop2 :             // middle loop: selects the column of mat3
            XOR EDX, EDX        // a new element starts: reset the mat1 column counter
                XOR EDI, EDI    // and clear the accumulator
                CMP k, EBX      // have all k columns of this row been produced?
                JZ Loop1        // yes: go to the next output row
                INC EBX         // advance the mat2 column counter
                JMP Loop3

            Loop3 :         // inner loop: one product per pass
            CMP n, EDX      // have all n products of this element been summed?
                JZ Loop2    // yes: go to the next element
                INC EDX     // advance the counter of the elements being multiplied
                JMP Stuffs  // go to the multiply/accumulate block

            Stuffs :                                        // computes one product and adds it to the accumulator
                // NOTE(review): the two lines below are the ones the compiler rejects; the
                // brackets in their operands are also unbalanced as posted.
                // NOTE(review): mat2 is laid out as n rows of k values, but the #59 index
                // is (EBX - 1) * 2 + (EDX - 1) -- confirm it addresses row EDX, column EBX.
    #58     MOV SI, mat1[2 * ((EAX - 1) * 2 + (EDX - 1)]    // intended: SI = mat1 element at row EAX, column EDX
    #59         IMUL SI, mat2[2 * ((EBX - 1) * 2 + (EDX - 1)]   // intended: signed multiply of SI by the mat2 element at row EDX, column EBX
                ADD DI, SI                                  // 16-bit add: only the low half of EDI is updated
                CMP n, EDX                                  // was this the n-th product?
                JZ CopyResult                               // yes: store the finished element
                JMP Loop3                                   // no: continue with the next product

            CopyResult :
                // NOTE(review): stores all 32 bits of EDI although the sum was built in
                // DI only, so a negative 16-bit sum is not sign-extended -- confirm.
    #66     MOV mat3[4 * ((EAX - 1) * 4 + (EBX - 1))], EDI  // intended: mat3 element at row EAX, column EBX = accumulator
                JMP Loop3                                   // back to Loop3, which now sees n == EDX and moves on

            Finale :
    }

    {
        // Print mat3 as m rows of k values; h walks the flat array.
        unsigned int i, j, h;

        printf("Output matrix:\n");
        for (i = h = 0; i < m; i++) {
            for (j = 0; j < k; j++, h++)
                printf("%6d ", mat3[h]);
            printf("\n");
        }

    }

unsigned int m = 3; // mat1 的行数
unsigned int n = 2; // mat1 的列数
unsigned int k = 4; // mat2 的列数
short int mat1[] = { -1,-2, 4,5, 4,-2 }; // 第一个矩阵
short int mat2[] = { 2,0,4,6, 0,2,-1,3 }; // 第二个矩阵
int mat3[1024]; // 输出矩阵
__asm {
XOR EAX, EAX // mat1 行计数器
XOR EBX, EBX // mat2 列计数器
XOR EDX, EDX // mat1 列（等于 mat2 行）计数器
XOR EDI, EDI // 保存将要复制到输出矩阵中的乘积之和
Loop1: // 确定输出矩阵 mat3 的行
XOR EBX, EBX // 一行结束时，重置列计数器
CMP m, EAX // 如果已经循环了 mat1 的 m 行……
JZ Finale // ……算法结束
INC EAX // 增加 mat1 行计数器
JMP Loop2
Loop2: // 确定 mat3 的列
XOR EDX, EDX // n 次求和结束后，重置 mat1 列计数器
XOR EDI, EDI // n 次乘积求和之后，重置 EDI
CMP k, EBX // 如果这一行的乘法/求和已经完成……
JZ Loop1 // ……转到输出矩阵的下一行
INC EBX // 增加 mat2 列计数器
JMP Loop3
Loop3: // 确定 mat3 的元素
CMP n, EDX // 如果前 n 个元素的 n 次乘法/求和已完成……
JZ Loop2 // ……跳到下一组 n 个元素
INC EDX // 增加参与乘法的元素的计数器
JMP Stuffs // 转到运算代码块
Stuffs: // 此处代码生成 mat3 的元素
#58 MOV SI, mat1[2 * ((EAX - 1) * 2 + (EDX - 1)] // 把 mat1 的 [m 行/n 列] 元素移入 SI
#59 IMUL SI, mat2[2 * ((EBX - 1) * 2 + (EDX - 1)] // SI 与 mat2 的 [n 行/k 列] 元素做有符号乘法
ADD DI, SI // 把结果累加到 EDI 中
CMP n, EDX // 检查求和次数
JZ CopyResult // 如果已完成 n 次求和……
JMP Loop3 // ……转到把结果复制到 mat3
CopyResult:
#66 MOV mat3[4 * ((EAX - 1) * 4 + (EBX - 1))], EDI // 把值复制到输出矩阵 mat3
JMP Loop3 // 转到下一组 n 个元素
Finale:
}
{
unsigned int i, j, h;
printf("Output matrix:\n");
for (i = h = 0; i < m; i++) {
for (j = 0; j < k; j++, h++)
printf("%6d ", mat3[h]);
printf("\n");
}
}


在这段代码中，编译器报告了两种类型的错误，分别引用mat1、mat2和mat3的IMUL和MOV。它们是：

第58行-错误C2404“EDX”：第二个操作数中的寄存器非法
第58行-错误C2430“第二个操作数”中存在多个索引寄存器

对于第59行和第66行，EDX和EBX寄存器存在相同的错误
这个算法基本上正确吗？（我在调试时手动设置了一些索引，一直试到最后一个，
结果都是对的，但我无法完整地测试它。）
我认为第一个错误是由第二个错误引起的，但是如果
不能以这种方式使用寄存器，我该如何计算输出？
不要试图在寻址模式中把多个寄存器都乘以 2（x86 的寻址模式只允许一个带比例因子的变址寄存器），只需使用 add eax, 2
而不是 inc eax。

此外，由于您的输出矩阵使用 32 位 int，因此您应该进行 32 位运算。您在 DI 中生成了一个 16 位的值，然后在第 66 行把它连同 EDI 高半部分里的所有垃圾值一起存了出去。
类似于

movsx esi, word ptr [行起始地址 + 列偏移]
movsx eax, word ptr [列内偏移 + 行]
imul eax, esi

这样的代码可能适用于内部循环体（的一部分）。至于如何在第一种寻址模式中按列递增、在第二种寻址模式中按行递增，就留给您自己解决了。

根据我对您意图的理解，我认为您的算法可能是合理的：对于输出矩阵的每个元素，在一个矩阵中沿一列循环，在另一个矩阵中沿一行循环，因此输出矩阵的每个元素只需要存储一次。至于您的循环是否真的做到了这一点，我不确定：这些分支写得太难看了，看得我头疼。（有空可以看看优化编译器为一个循环生成的输出，然后再看看二重或三重嵌套循环的输出，例如使用在线的编译器输出查看工具；原文此处的链接已丢失。）

嵌套循环的其他方法对于大型矩阵的性能可能更好或更差，但唯一真正好的方法（对于大型矩阵）是转置其中一个输入矩阵，以便可以同时在两个矩阵中的连续内存元素上循环（因为转置需要O（n^2）时间，但会加快O（n^3）重复遍历转置数组的步骤，因为它会提供更多缓存命中）
（考虑到浮点matmul在科学计算中是多么常见，这是一个经过广泛研究的主题，在代码的实验调优方面投入了大量精力。请参阅BLAS中DGEMM函数的各种实现。）
不要试图在寻址模式中把多个寄存器都乘以 2（x86 的寻址模式只允许一个带比例因子的变址寄存器），只需使用 add eax, 2
而不是 inc eax。

此外，由于输出矩阵使用32位int，
/*
 * Computes mat3 = mat1 * mat2 with an inline __asm block and prints the result.
 *
 * mat1 is m x n and mat2 is n x k, both arrays of short int read as 16-bit
 * words; mat3 receives m rows of k 32-bit values, stored contiguously.
 *
 * Register roles inside the __asm block:
 *   eax  address of the current mat1 row
 *   ebx  address of the row-0 element of the current mat2 column
 *   ecx  index of the product inside the current dot product (0 .. n-1)
 *   edx  index of the current result column (0 .. k-1)
 *   edi  address of the current mat3 row
 *   esi  scratch: mat1 element, product, and row-size offsets
 *
 * m is decremented in place as the row counter; its original value is saved
 * with push at the start and restored with pop at the end.
 *
 * NOTE(review): all three loops test their condition only after a pass, so
 * m, n and k are assumed to be at least 1 -- confirm for other inputs.
 * NOTE(review): printf is used without a visible #include; presumably
 * <stdio.h> is included outside this listing.
 */
void main()
{
    unsigned int m = 3; // number of rows of the first matrix
    unsigned int n = 2; // number of columns of the first matrix
    unsigned int k = 4; // number of columns of the second matrix
    short int mat1[] = { -1,-2, 4,5, 4,-2 }; // first matrix
    short int mat2[] = { 2,0,0,0, 0,2,0,0 }; // second matrix
    int mat3[1024]; // result matrix (not initialized here)

    __asm {

        lea eax, mat1       // eax = address of the current mat1 row (starts at row 0)
        lea edi, mat3       // edi = address of the current mat3 row (starts at row 0)
        push m              // save m: it is decremented in place below

        Loop3 :             // outer loop: one pass per row of mat1 / mat3
        lea ebx, mat2       // restart from the beginning of mat2 for every new result row
            xor edx, edx    // result column index back to zero for the new row

        Loop2 :             // middle loop: one pass per column of mat2 (k passes)
        xor ecx, ecx        // product index back to zero for the new result element

        Loop1 :             // inner loop: n products per result element
        call Compute        // accumulate one row-element * column-element product
            inc ecx         // next product
            cmp ecx, n      // all n products done?
            jb Loop1        // not yet (unsigned compare): stay in the inner loop
            add ebx, 2      // inner loop over: advance ebx by one 16-bit element, i.e. to the next mat2 column
            inc edx         // next result column
            cmp edx, k      // all k columns done?
            jb Loop2        // not yet: compute the next element of this row
            imul esi, n, 2      // esi = n * 2 = size in bytes of one mat1 row
            add eax, esi        // advance eax to the next mat1 row
            imul esi, k, 4      // esi = k * 4 = size in bytes of one mat3 row
            add edi, esi        // advance edi to the next mat3 row
            dec m               // one more mat1 row done: decrease the remaining-row counter
            cmp m, 0            // any rows left?
            ja Loop3            // yes: run the outer loop again
            jmp Finale          // no: jump over the Compute subroutine to the end

        Compute :               // subroutine: adds one product to the current result element
        movsx esi, WORD PTR[eax][ecx * 2]       // esi = sign-extended element ecx of the current mat1 row
            push edi            // save the mat3 row address so edi can be used as scratch
            push ecx            // save the product index so ecx can be used as scratch
            mov edi, k          // edi = k ...
            imul edi, ecx       // ... * ecx = element offset of row ecx within mat2
            movsx ecx, WORD PTR[ebx][edi * 2]   // ecx = sign-extended mat2 element at that row, current column
            imul esi, ecx       // esi = product of the mat1 and mat2 elements
            pop ecx             // restore the product index
            pop edi             // restore the mat3 row address
            cmp ecx, 0          // is this the first product of the element?
            je First            // yes: store instead of add, since mat3 starts uninitialized
            add[edi][edx * 4], esi  // no: add the product to the value already stored

        Back :
        ret                     // return to the inner loop in either case

        First :                 // first product: it becomes the initial value of the element
        mov[edi][edx * 4], esi  // store the product
            jmp Back

        Finale :        // end of the computation
        pop m           // restore the original row count, needed by the print loop below

    }

    // Print the result matrix: m rows of k values, h walks the flat array

    {
        unsigned int i, j, h;

        printf("Product Matrix:\n");
        for (i = h = 0; i < m; i++) {
            for (j = 0; j < k; j++, h++)
                printf("%6d ", mat3[h]);
            printf("\n");
        }

    }
}

m = rows of first matrix
k = columns of second matrix
n = columns of first matrix and rows of second matrix

x=0
Loop3:
y=0
Loop2:
z=0
Loop1:
compute...
z++
if(z<n)
   go to Loop1
y++
if(y < k)
   go to Loop2
x++
if(x < m)
   go to Loop3
Else go to the End