# Assignment2: GNU Toolchain contributed by < [TRChen11011](https://github.com/TRChen11011/GNU-Toolchain) > ## Install VMware and Ubuntu I use ==Ubuntu Linux 22.04== on VMware. Because using clz to implement multiplication is cool. ## Instructions * Set the environment variables > cd $HOME > source riscv-none-elf-gcc/setenv * Build with make > make * Run the code > build/rv32emu test.elf * Display the assembler mnemonics for the machine instructions and store them in a .txt file > riscv-none-elf-objdump -d test > test.txt * Check the ELF size > riscv-none-elf-size ./test ## Before we modify the code We use perfcount to read the CSRs (control and status registers). The CSRs are used to count the cycles the code takes. **perfcount/main.c** ```c= #include <stdint.h> #include <stdio.h> #include <string.h> //those are functions in the same dir in perfcounter used to take CSR extern uint64_t get_cycles(); extern uint64_t get_instret(); int main(void) { /* measure cycles */ uint64_t instret = get_instret(); uint64_t oldcount = get_cycles(); /* fill the C code you choose here, so you can get the CSR*/ uint64_t cyclecount = get_cycles() - oldcount; printf("cycle count: %u\n", (unsigned int) cyclecount); printf("instret: %x\n", (unsigned) (instret & 0xffffffff)); return 0; } ``` We also have to modify the Makefile: ```= .PHONY: clean include ../../mk/toolchain.mk /*-Ofast , -O1 , -O2 ... you want need to chane in Makefile*/ CFLAGS = -march=rv32i_zicsr_zifencei -mabi=ilp32 -O1 -Wall OBJS = \ /*if you want add file, fill in*/ getcycles.o \ getinstret.o \ main.o BIN = perfcount.elf /*the final result you generate*/ %.o: %.S $(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $< %.o: %.c $(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $< all: $(BIN) $(BIN): $(OBJS) $(CROSS_COMPILE)gcc -o $@ $^ clean: $(RM) $(BIN) $(OBJS) ``` ## Chosen Project I chose the project from < [陳彥佑](https://hackmd.io/ZlqmPMmHQieGU7-esbziRw?both) > The project uses clz to implement a 32-bit × 32-bit multiplier. 
### C code after modify to CSR ```c= #include <stdint.h> #include <stdio.h> #include <string.h> #include <inttypes.h> extern uint64_t get_cycles(); extern uint64_t get_instret(); uint16_t CLZ_32(uint32_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x -= ((x >> 1) & 0x55555555); x = ((x >> 2) & 0x33333333) + (x & 0x33333333); x = ((x >> 4) + x) & 0x0f0f0f0f; x += (x >> 8); x += (x >> 16); return (32 - (x & 0x3f)); } uint64_t efficient_int_mul(uint32_t A, uint32_t B) { uint16_t n = CLZ_32(A); uint16_t m = CLZ_32(B); uint16_t result_bits; if(n>m) result_bits = n; else{ result_bits = m; uint32_t temp = A; A = B; B = temp; } uint64_t result = 0; for (int i = 0; i < 32-result_bits; i++) { if ((A >> i) & 1) { result += ((uint64_t)B << i); } } return result; } int main(void) { unsigned int state[WORDS] = {0}; /* measure cycles */ uint64_t instret = get_instret(); uint64_t oldcount = get_cycles(); uint32_t A = 0x12345678; uint32_t B = 0xffffdddd; uint64_t result = efficient_int_mul(A, B); uint64_t cyclecount = get_cycles() - oldcount; printf("cycle count: %u\n", (unsigned int) cyclecount); printf("instret: %x\n", (unsigned) (instret & 0xffffffff)); printf("uint64: %"PRIX64"\n", result); return 0; } ``` ## Assembly code ### -O1 Assembly code ```asm= 0001016c <CLZ_32>: 1016c: 00155793 srl a5,a0,0x1 10170: 00a7e533 or a0,a5,a0 10174: 00255793 srl a5,a0,0x2 10178: 00a7e7b3 or a5,a5,a0 1017c: 0047d513 srl a0,a5,0x4 10180: 00f56533 or a0,a0,a5 10184: 00855713 srl a4,a0,0x8 10188: 00a76733 or a4,a4,a0 1018c: 01075793 srl a5,a4,0x10 10190: 00e7e7b3 or a5,a5,a4 10194: 0017d713 srl a4,a5,0x1 10198: 555556b7 lui a3,0x55555 1019c: 55568693 add a3,a3,1365 # 55555555 <__BSS_END__+0x55537805> 101a0: 00d77733 and a4,a4,a3 101a4: 40e787b3 sub a5,a5,a4 101a8: 0027d713 srl a4,a5,0x2 101ac: 333336b7 lui a3,0x33333 101b0: 33368693 add a3,a3,819 # 33333333 <__BSS_END__+0x333155e3> 101b4: 00d77733 and a4,a4,a3 101b8: 00d7f7b3 and a5,a5,a3 101bc: 00f70733 add 
a4,a4,a5 101c0: 00475793 srl a5,a4,0x4 101c4: 00e787b3 add a5,a5,a4 101c8: 0f0f1737 lui a4,0xf0f1 101cc: f0f70713 add a4,a4,-241 # f0f0f0f <__BSS_END__+0xf0d31bf> 101d0: 00e7f7b3 and a5,a5,a4 101d4: 0087d713 srl a4,a5,0x8 101d8: 00f70733 add a4,a4,a5 101dc: 01075793 srl a5,a4,0x10 101e0: 00e787b3 add a5,a5,a4 101e4: 03f7f793 and a5,a5,63 101e8: 02000513 li a0,32 101ec: 40f50533 sub a0,a0,a5 101f0: 01051513 sll a0,a0,0x10 101f4: 01055513 srl a0,a0,0x10 101f8: 00008067 ret 000101fc <efficient_int_mul>: 101fc: ff010113 add sp,sp,-16 10200: 00112623 sw ra,12(sp) 10204: 00812423 sw s0,8(sp) 10208: 00912223 sw s1,4(sp) 1020c: 01212023 sw s2,0(sp) 10210: 00050493 mv s1,a0 10214: 00058413 mv s0,a1 10218: f55ff0ef jal 1016c <CLZ_32> 1021c: 00050913 mv s2,a0 10220: 00040513 mv a0,s0 10224: f49ff0ef jal 1016c <CLZ_32> 10228: 01257a63 bgeu a0,s2,1023c <efficient_int_mul+0x40> 1022c: 00090513 mv a0,s2 10230: 00040793 mv a5,s0 10234: 00048413 mv s0,s1 10238: 00078493 mv s1,a5 1023c: 02000613 li a2,32 10240: 40a60633 sub a2,a2,a0 10244: 06c05463 blez a2,102ac <efficient_int_mul+0xb0> 10248: 00000793 li a5,0 1024c: 00000513 li a0,0 10250: 00000593 li a1,0 10254: 0014de13 srl t3,s1,0x1 10258: 01f00313 li t1,31 1025c: 00000893 li a7,0 10260: 02c0006f j 1028c <efficient_int_mul+0x90> 10264: 40f30733 sub a4,t1,a5 10268: 00ee5733 srl a4,t3,a4 1026c: 00f496b3 sll a3,s1,a5 10270: 00d506b3 add a3,a0,a3 10274: 00a6b833 sltu a6,a3,a0 10278: 00e585b3 add a1,a1,a4 1027c: 00068513 mv a0,a3 10280: 00b805b3 add a1,a6,a1 10284: 00178793 add a5,a5,1 10288: 02c78663 beq a5,a2,102b4 <efficient_int_mul+0xb8> 1028c: 00f45733 srl a4,s0,a5 10290: 00177713 and a4,a4,1 10294: fe0708e3 beqz a4,10284 <efficient_int_mul+0x88> 10298: fe078713 add a4,a5,-32 1029c: fc0744e3 bltz a4,10264 <efficient_int_mul+0x68> 102a0: 00e49733 sll a4,s1,a4 102a4: 00088693 mv a3,a7 102a8: fc9ff06f j 10270 <efficient_int_mul+0x74> 102ac: 00000513 li a0,0 102b0: 00000593 li a1,0 102b4: 00c12083 lw ra,12(sp) 102b8: 00812403 lw 
s0,8(sp) 102bc: 00412483 lw s1,4(sp) 102c0: 00012903 lw s2,0(sp) 102c4: 01010113 add sp,sp,16 102c8: 00008067 ret 000102cc <main>: 102cc: fe010113 add sp,sp,-32 102d0: 00112e23 sw ra,28(sp) 102d4: 00812c23 sw s0,24(sp) 102d8: 00912a23 sw s1,20(sp) 102dc: 01212823 sw s2,16(sp) 102e0: 01312623 sw s3,12(sp) 102e4: e75ff0ef jal 10158 <get_instret> 102e8: 00050913 mv s2,a0 102ec: e59ff0ef jal 10144 <get_cycles> 102f0: 00050993 mv s3,a0 102f4: ffffe5b7 lui a1,0xffffe 102f8: ddd58593 add a1,a1,-547 # ffffdddd <__BSS_END__+0xfffe008d> 102fc: 12345537 lui a0,0x12345 10300: 67850513 add a0,a0,1656 # 12345678 <__BSS_END__+0x12327928> 10304: ef9ff0ef jal 101fc <efficient_int_mul> 10308: 00050493 mv s1,a0 1030c: 00058413 mv s0,a1 10310: e35ff0ef jal 10144 <get_cycles> 10314: 413505b3 sub a1,a0,s3 10318: 0001c537 lui a0,0x1c 1031c: bf050513 add a0,a0,-1040 # 1bbf0 <__clzsi2+0x6e> 10320: 424000ef jal 10744 <printf> 10324: 00090593 mv a1,s2 10328: 0001c537 lui a0,0x1c 1032c: c0450513 add a0,a0,-1020 # 1bc04 <__clzsi2+0x82> 10330: 414000ef jal 10744 <printf> 10334: 00048613 mv a2,s1 10338: 00040693 mv a3,s0 1033c: 0001c537 lui a0,0x1c 10340: c1450513 add a0,a0,-1004 # 1bc14 <__clzsi2+0x92> 10344: 400000ef jal 10744 <printf> 10348: 00000513 li a0,0 1034c: 01c12083 lw ra,28(sp) 10350: 01812403 lw s0,24(sp) 10354: 01412483 lw s1,20(sp) 10358: 01012903 lw s2,16(sp) 1035c: 00c12983 lw s3,12(sp) 10360: 02010113 add sp,sp,32 10364: 00008067 ret ``` #### elf size ![](https://hackmd.io/_uploads/Skz1R90bp.png) #### execute ![](https://hackmd.io/_uploads/SkLFp5CZa.png) - observe - Total cycle count : `393` - Total instruction count : `2c7` - Register use : `$ra, $sp, $a0~$a7, $s0~$s3, $t0,%t1` ### -O2 Assembly code ```asm= 00010208 <CLZ_32>: 10208: 00155793 srl a5,a0,0x1 1020c: 00a7e533 or a0,a5,a0 10210: 00255793 srl a5,a0,0x2 10214: 00a7e7b3 or a5,a5,a0 10218: 0047d513 srl a0,a5,0x4 1021c: 00f56533 or a0,a0,a5 10220: 00855713 srl a4,a0,0x8 10224: 00a76733 or a4,a4,a0 10228: 01075793 srl 
a5,a4,0x10 1022c: 00e7e7b3 or a5,a5,a4 10230: 555556b7 lui a3,0x55555 10234: 0017d713 srl a4,a5,0x1 10238: 55568693 add a3,a3,1365 # 55555555 <__BSS_END__+0x55537805> 1023c: 00d77733 and a4,a4,a3 10240: 40e787b3 sub a5,a5,a4 10244: 333336b7 lui a3,0x33333 10248: 33368693 add a3,a3,819 # 33333333 <__BSS_END__+0x333155e3> 1024c: 0027d713 srl a4,a5,0x2 10250: 00d77733 and a4,a4,a3 10254: 00d7f7b3 and a5,a5,a3 10258: 00f70733 add a4,a4,a5 1025c: 00475793 srl a5,a4,0x4 10260: 0f0f16b7 lui a3,0xf0f1 10264: 00e787b3 add a5,a5,a4 10268: f0f68693 add a3,a3,-241 # f0f0f0f <__BSS_END__+0xf0d31bf> 1026c: 00d7f7b3 and a5,a5,a3 10270: 0087d713 srl a4,a5,0x8 10274: 00f70733 add a4,a4,a5 10278: 01075793 srl a5,a4,0x10 1027c: 00e787b3 add a5,a5,a4 10280: 03f7f793 and a5,a5,63 10284: 02000513 li a0,32 10288: 40f50533 sub a0,a0,a5 1028c: 01051513 sll a0,a0,0x10 10290: 01055513 srl a0,a0,0x10 10294: 00008067 ret 00010298 <efficient_int_mul>: 10298: ff010113 add sp,sp,-16 1029c: 00812423 sw s0,8(sp) 102a0: 00912223 sw s1,4(sp) 102a4: 01212023 sw s2,0(sp) 102a8: 00112623 sw ra,12(sp) 102ac: 00058413 mv s0,a1 102b0: 00050493 mv s1,a0 102b4: f55ff0ef jal 10208 <CLZ_32> 102b8: 00050913 mv s2,a0 102bc: 00040513 mv a0,s0 102c0: f49ff0ef jal 10208 <CLZ_32> 102c4: 01257a63 bgeu a0,s2,102d8 <efficient_int_mul+0x40> 102c8: 00040793 mv a5,s0 102cc: 00090513 mv a0,s2 102d0: 00048413 mv s0,s1 102d4: 00078493 mv s1,a5 102d8: 02000613 li a2,32 102dc: 40a60633 sub a2,a2,a0 102e0: 06c05263 blez a2,10344 <efficient_int_mul+0xac> 102e4: 00000793 li a5,0 102e8: 00000513 li a0,0 102ec: 00000593 li a1,0 102f0: 0014d313 srl t1,s1,0x1 102f4: 01f00893 li a7,31 102f8: 0280006f j 10320 <efficient_int_mul+0x88> 102fc: 00d496b3 sll a3,s1,a3 10300: 00000713 li a4,0 10304: 00e50733 add a4,a0,a4 10308: 00a73833 sltu a6,a4,a0 1030c: 00d585b3 add a1,a1,a3 10310: 00070513 mv a0,a4 10314: 00b805b3 add a1,a6,a1 10318: 00178793 add a5,a5,1 1031c: 02f60863 beq a2,a5,1034c <efficient_int_mul+0xb4> 10320: 00f45733 srl 
a4,s0,a5 10324: 00177713 and a4,a4,1 10328: fe078693 add a3,a5,-32 1032c: fe0706e3 beqz a4,10318 <efficient_int_mul+0x80> 10330: 40f88833 sub a6,a7,a5 10334: fc06d4e3 bgez a3,102fc <efficient_int_mul+0x64> 10338: 00f49733 sll a4,s1,a5 1033c: 010356b3 srl a3,t1,a6 10340: fc5ff06f j 10304 <efficient_int_mul+0x6c> 10344: 00000513 li a0,0 10348: 00000593 li a1,0 1034c: 00c12083 lw ra,12(sp) 10350: 00812403 lw s0,8(sp) 10354: 00412483 lw s1,4(sp) 10358: 00012903 lw s2,0(sp) 1035c: 01010113 add sp,sp,16 10360: 00008067 ret 000100b0 <main>: 100b0: fe010113 add sp,sp,-32 100b4: 00112e23 sw ra,28(sp) 100b8: 00812c23 sw s0,24(sp) 100bc: 00912a23 sw s1,20(sp) 100c0: 01212823 sw s2,16(sp) 100c4: 01312623 sw s3,12(sp) 100c8: 12c000ef jal 101f4 <get_instret> 100cc: 00050913 mv s2,a0 100d0: 110000ef jal 101e0 <get_cycles> 100d4: 00050993 mv s3,a0 100d8: ffffe5b7 lui a1,0xffffe 100dc: 12345537 lui a0,0x12345 100e0: ddd58593 add a1,a1,-547 # ffffdddd <__BSS_END__+0xfffe008d> 100e4: 67850513 add a0,a0,1656 # 12345678 <__BSS_END__+0x12327928> 100e8: 1b0000ef jal 10298 <efficient_int_mul> 100ec: 00050493 mv s1,a0 100f0: 00058413 mv s0,a1 100f4: 0ec000ef jal 101e0 <get_cycles> 100f8: 413505b3 sub a1,a0,s3 100fc: 0001c537 lui a0,0x1c 10100: bf050513 add a0,a0,-1040 # 1bbf0 <__clzsi2+0x72> 10104: 63c000ef jal 10740 <printf> 10108: 0001c537 lui a0,0x1c 1010c: 00090593 mv a1,s2 10110: c0450513 add a0,a0,-1020 # 1bc04 <__clzsi2+0x86> 10114: 62c000ef jal 10740 <printf> 10118: 0001c537 lui a0,0x1c 1011c: 00048613 mv a2,s1 10120: 00040693 mv a3,s0 10124: c1450513 add a0,a0,-1004 # 1bc14 <__clzsi2+0x96> 10128: 618000ef jal 10740 <printf> 1012c: 01c12083 lw ra,28(sp) 10130: 01812403 lw s0,24(sp) 10134: 01412483 lw s1,20(sp) 10138: 01012903 lw s2,16(sp) 1013c: 00c12983 lw s3,12(sp) 10140: 00000513 li a0,0 10144: 02010113 add sp,sp,32 10148: 00008067 ret ``` #### elf size ![](https://hackmd.io/_uploads/rk82JoA-T.png) #### execute ![](https://hackmd.io/_uploads/rJSoyiCZT.png) - observe - Total 
cycle count : `421` - Total instruction count : `2c7` - Register use : `$ra, $sp, $a0~$a7, $s0~$s3, $t1` ### -O3 Assembly code ```asm= 00010208 <CLZ_32>: 10208: 00155793 srl a5,a0,0x1 1020c: 00a7e533 or a0,a5,a0 10210: 00255793 srl a5,a0,0x2 10214: 00a7e7b3 or a5,a5,a0 10218: 0047d513 srl a0,a5,0x4 1021c: 00f56533 or a0,a0,a5 10220: 00855713 srl a4,a0,0x8 10224: 00a76733 or a4,a4,a0 10228: 01075793 srl a5,a4,0x10 1022c: 00e7e7b3 or a5,a5,a4 10230: 555556b7 lui a3,0x55555 10234: 0017d713 srl a4,a5,0x1 10238: 55568693 add a3,a3,1365 # 55555555 <__BSS_END__+0x55537805> 1023c: 00d77733 and a4,a4,a3 10240: 40e787b3 sub a5,a5,a4 10244: 333336b7 lui a3,0x33333 10248: 33368693 add a3,a3,819 # 33333333 <__BSS_END__+0x333155e3> 1024c: 0027d713 srl a4,a5,0x2 10250: 00d77733 and a4,a4,a3 10254: 00d7f7b3 and a5,a5,a3 10258: 00f70733 add a4,a4,a5 1025c: 00475793 srl a5,a4,0x4 10260: 0f0f16b7 lui a3,0xf0f1 10264: 00e787b3 add a5,a5,a4 10268: f0f68693 add a3,a3,-241 # f0f0f0f <__BSS_END__+0xf0d31bf> 1026c: 00d7f7b3 and a5,a5,a3 10270: 0087d713 srl a4,a5,0x8 10274: 00f70733 add a4,a4,a5 10278: 01075793 srl a5,a4,0x10 1027c: 00e787b3 add a5,a5,a4 10280: 03f7f793 and a5,a5,63 10284: 02000513 li a0,32 10288: 40f50533 sub a0,a0,a5 1028c: 01051513 sll a0,a0,0x10 10290: 01055513 srl a0,a0,0x10 10294: 00008067 ret 00010298 <efficient_int_mul>: 10298: 00155713 srl a4,a0,0x1 1029c: 0015d793 srl a5,a1,0x1 102a0: 00a76733 or a4,a4,a0 102a4: 00b7e7b3 or a5,a5,a1 102a8: 0027d693 srl a3,a5,0x2 102ac: 00058613 mv a2,a1 102b0: 00275593 srl a1,a4,0x2 102b4: 00b76733 or a4,a4,a1 102b8: 00d7e7b3 or a5,a5,a3 102bc: 00475593 srl a1,a4,0x4 102c0: 0047d693 srl a3,a5,0x4 102c4: 00b76733 or a4,a4,a1 102c8: 00d7e7b3 or a5,a5,a3 102cc: 00875593 srl a1,a4,0x8 102d0: 0087d693 srl a3,a5,0x8 102d4: 00b76733 or a4,a4,a1 102d8: 00d7e7b3 or a5,a5,a3 102dc: 01075593 srl a1,a4,0x10 102e0: 0107d693 srl a3,a5,0x10 102e4: 00b76733 or a4,a4,a1 102e8: 00d7e7b3 or a5,a5,a3 102ec: 555556b7 lui a3,0x55555 102f0: 0017d593 srl 
a1,a5,0x1 102f4: 55568693 add a3,a3,1365 # 55555555 <__BSS_END__+0x55537805> 102f8: 00050813 mv a6,a0 102fc: 00175513 srl a0,a4,0x1 10300: 00d57533 and a0,a0,a3 10304: 00d5f6b3 and a3,a1,a3 10308: 40a70733 sub a4,a4,a0 1030c: 40d787b3 sub a5,a5,a3 10310: 33333537 lui a0,0x33333 10314: 33350513 add a0,a0,819 # 33333333 <__BSS_END__+0x333155e3> 10318: 00275593 srl a1,a4,0x2 1031c: 0027d693 srl a3,a5,0x2 10320: 00a77733 and a4,a4,a0 10324: 00a7f7b3 and a5,a5,a0 10328: 00a5f5b3 and a1,a1,a0 1032c: 00a6f6b3 and a3,a3,a0 10330: 00e585b3 add a1,a1,a4 10334: 00f686b3 add a3,a3,a5 10338: 0046d713 srl a4,a3,0x4 1033c: 0045d793 srl a5,a1,0x4 10340: 0f0f1537 lui a0,0xf0f1 10344: f0f50513 add a0,a0,-241 # f0f0f0f <__BSS_END__+0xf0d31bf> 10348: 00b787b3 add a5,a5,a1 1034c: 00d70733 add a4,a4,a3 10350: 00a7f7b3 and a5,a5,a0 10354: 00a77733 and a4,a4,a0 10358: 0087d593 srl a1,a5,0x8 1035c: 00875693 srl a3,a4,0x8 10360: 00b787b3 add a5,a5,a1 10364: 00d70733 add a4,a4,a3 10368: 01075693 srl a3,a4,0x10 1036c: 0107d593 srl a1,a5,0x10 10370: 00d70733 add a4,a4,a3 10374: 00b787b3 add a5,a5,a1 10378: 02000693 li a3,32 1037c: 03f7f793 and a5,a5,63 10380: 03f77713 and a4,a4,63 10384: 40f687b3 sub a5,a3,a5 10388: 40e68733 sub a4,a3,a4 1038c: 01079793 sll a5,a5,0x10 10390: 01071713 sll a4,a4,0x10 10394: 0107d793 srl a5,a5,0x10 10398: 01075713 srl a4,a4,0x10 1039c: 00f77a63 bgeu a4,a5,103b0 <efficient_int_mul+0x118> 103a0: 00060693 mv a3,a2 103a4: 00078713 mv a4,a5 103a8: 00080613 mv a2,a6 103ac: 00068813 mv a6,a3 103b0: 02000893 li a7,32 103b4: 40e888b3 sub a7,a7,a4 103b8: 07105463 blez a7,10420 <efficient_int_mul+0x188> 103bc: 00000793 li a5,0 103c0: 00000513 li a0,0 103c4: 00000593 li a1,0 103c8: 00185e93 srl t4,a6,0x1 103cc: 01f00e13 li t3,31 103d0: 0280006f j 103f8 <efficient_int_mul+0x160> 103d4: 00d816b3 sll a3,a6,a3 103d8: 00000713 li a4,0 103dc: 00e50733 add a4,a0,a4 103e0: 00a73333 sltu t1,a4,a0 103e4: 00d585b3 add a1,a1,a3 103e8: 00070513 mv a0,a4 103ec: 00b305b3 add a1,t1,a1 
103f0: 00178793 add a5,a5,1 103f4: 02f88463 beq a7,a5,1041c <efficient_int_mul+0x184> 103f8: 00f65733 srl a4,a2,a5 103fc: 00177713 and a4,a4,1 10400: fe078693 add a3,a5,-32 10404: fe0706e3 beqz a4,103f0 <efficient_int_mul+0x158> 10408: 40fe0333 sub t1,t3,a5 1040c: fc06d4e3 bgez a3,103d4 <efficient_int_mul+0x13c> 10410: 00f81733 sll a4,a6,a5 10414: 006ed6b3 srl a3,t4,t1 10418: fc5ff06f j 103dc <efficient_int_mul+0x144> 1041c: 00008067 ret 10420: 00000513 li a0,0 10424: 00000593 li a1,0 10428: 00008067 ret 000100b0 <main>: 100b0: fe010113 add sp,sp,-32 100b4: 00112e23 sw ra,28(sp) 100b8: 00812c23 sw s0,24(sp) 100bc: 00912a23 sw s1,20(sp) 100c0: 01212823 sw s2,16(sp) 100c4: 01312623 sw s3,12(sp) 100c8: 12c000ef jal 101f4 <get_instret> 100cc: 00050913 mv s2,a0 100d0: 110000ef jal 101e0 <get_cycles> 100d4: 00050993 mv s3,a0 100d8: ffffe5b7 lui a1,0xffffe 100dc: 12345537 lui a0,0x12345 100e0: ddd58593 add a1,a1,-547 # ffffdddd <__BSS_END__+0xfffe008d> 100e4: 67850513 add a0,a0,1656 # 12345678 <__BSS_END__+0x12327928> 100e8: 1b0000ef jal 10298 <efficient_int_mul> 100ec: 00050493 mv s1,a0 100f0: 00058413 mv s0,a1 100f4: 0ec000ef jal 101e0 <get_cycles> 100f8: 413505b3 sub a1,a0,s3 100fc: 0001c537 lui a0,0x1c 10100: cb850513 add a0,a0,-840 # 1bcb8 <__clzsi2+0x72> 10104: 704000ef jal 10808 <printf> 10108: 0001c537 lui a0,0x1c 1010c: 00090593 mv a1,s2 10110: ccc50513 add a0,a0,-820 # 1bccc <__clzsi2+0x86> 10114: 6f4000ef jal 10808 <printf> 10118: 0001c537 lui a0,0x1c 1011c: 00048613 mv a2,s1 10120: 00040693 mv a3,s0 10124: cdc50513 add a0,a0,-804 # 1bcdc <__clzsi2+0x96> 10128: 6e0000ef jal 10808 <printf> 1012c: 01c12083 lw ra,28(sp) 10130: 01812403 lw s0,24(sp) 10134: 01412483 lw s1,20(sp) 10138: 01012903 lw s2,16(sp) 1013c: 00c12983 lw s3,12(sp) 10140: 00000513 li a0,0 10144: 02010113 add sp,sp,32 10148: 00008067 ret ``` #### elf size ![](https://hackmd.io/_uploads/r15txoRWa.png) #### execute ![](https://hackmd.io/_uploads/ryouesAWT.png) - observe - Total cycle count : `398` 
- Total instruction count : `2c7` - Register use : `$ra, $sp, $a0~$a7, $s0~$s3, $t1, $t3, $t4` ### -Os Assembly code ```asm= 000101f8 <CLZ_32>: 101f8: 00155793 srl a5,a0,0x1 101fc: 00a7e533 or a0,a5,a0 10200: 00255793 srl a5,a0,0x2 10204: 00a7e7b3 or a5,a5,a0 10208: 0047d513 srl a0,a5,0x4 1020c: 00f56533 or a0,a0,a5 10210: 00855713 srl a4,a0,0x8 10214: 00a76733 or a4,a4,a0 10218: 01075793 srl a5,a4,0x10 1021c: 00e7e7b3 or a5,a5,a4 10220: 555556b7 lui a3,0x55555 10224: 0017d713 srl a4,a5,0x1 10228: 55568693 add a3,a3,1365 # 55555555 <__BSS_END__+0x55537805> 1022c: 00d77733 and a4,a4,a3 10230: 40e787b3 sub a5,a5,a4 10234: 333336b7 lui a3,0x33333 10238: 33368693 add a3,a3,819 # 33333333 <__BSS_END__+0x333155e3> 1023c: 0027d713 srl a4,a5,0x2 10240: 00d77733 and a4,a4,a3 10244: 00d7f7b3 and a5,a5,a3 10248: 00f70733 add a4,a4,a5 1024c: 00475793 srl a5,a4,0x4 10250: 00e787b3 add a5,a5,a4 10254: 0f0f1737 lui a4,0xf0f1 10258: f0f70713 add a4,a4,-241 # f0f0f0f <__BSS_END__+0xf0d31bf> 1025c: 00e7f7b3 and a5,a5,a4 10260: 0087d713 srl a4,a5,0x8 10264: 00f70733 add a4,a4,a5 10268: 01075793 srl a5,a4,0x10 1026c: 00e787b3 add a5,a5,a4 10270: 03f7f793 and a5,a5,63 10274: 02000513 li a0,32 10278: 40f50533 sub a0,a0,a5 1027c: 01051513 sll a0,a0,0x10 10280: 01055513 srl a0,a0,0x10 10284: 00008067 ret 00010288 <efficient_int_mul>: 10288: fe010113 add sp,sp,-32 1028c: 00812c23 sw s0,24(sp) 10290: 00912a23 sw s1,20(sp) 10294: 01312623 sw s3,12(sp) 10298: 00112e23 sw ra,28(sp) 1029c: 01212823 sw s2,16(sp) 102a0: 01412423 sw s4,8(sp) 102a4: 01512223 sw s5,4(sp) 102a8: 00058413 mv s0,a1 102ac: 00050993 mv s3,a0 102b0: f49ff0ef jal 101f8 <CLZ_32> 102b4: 00050493 mv s1,a0 102b8: 00040513 mv a0,s0 102bc: f3dff0ef jal 101f8 <CLZ_32> 102c0: 00957a63 bgeu a0,s1,102d4 <efficient_int_mul+0x4c> 102c4: 00040793 mv a5,s0 102c8: 00048513 mv a0,s1 102cc: 00098413 mv s0,s3 102d0: 00078993 mv s3,a5 102d4: 02000a93 li s5,32 102d8: 00000913 li s2,0 102dc: 00000a13 li s4,0 102e0: 00000493 li s1,0 102e4: 
40aa8ab3 sub s5,s5,a0 102e8: 03594863 blt s2,s5,10318 <efficient_int_mul+0x90> 102ec: 01c12083 lw ra,28(sp) 102f0: 01812403 lw s0,24(sp) 102f4: 01012903 lw s2,16(sp) 102f8: 00c12983 lw s3,12(sp) 102fc: 00412a83 lw s5,4(sp) 10300: 000a0513 mv a0,s4 10304: 00048593 mv a1,s1 10308: 00812a03 lw s4,8(sp) 1030c: 01412483 lw s1,20(sp) 10310: 02010113 add sp,sp,32 10314: 00008067 ret 10318: 012457b3 srl a5,s0,s2 1031c: 0017f793 and a5,a5,1 10320: 02078463 beqz a5,10348 <efficient_int_mul+0xc0> 10324: 00090613 mv a2,s2 10328: 00098513 mv a0,s3 1032c: 00000593 li a1,0 10330: 020000ef jal 10350 <__ashldi3> 10334: 00aa0533 add a0,s4,a0 10338: 014537b3 sltu a5,a0,s4 1033c: 00b484b3 add s1,s1,a1 10340: 00050a13 mv s4,a0 10344: 009784b3 add s1,a5,s1 10348: 00190913 add s2,s2,1 1034c: f9dff06f j 102e8 <efficient_int_mul+0x60> 000100b0 <main>: 100b0: fe010113 add sp,sp,-32 100b4: 00112e23 sw ra,28(sp) 100b8: 00812c23 sw s0,24(sp) 100bc: 00912a23 sw s1,20(sp) 100c0: 124000ef jal 101e4 <get_instret> 100c4: 00050413 mv s0,a0 100c8: 108000ef jal 101d0 <get_cycles> 100cc: 00050493 mv s1,a0 100d0: ffffe5b7 lui a1,0xffffe 100d4: 12345537 lui a0,0x12345 100d8: ddd58593 add a1,a1,-547 # ffffdddd <__BSS_END__+0xfffe008d> 100dc: 67850513 add a0,a0,1656 # 12345678 <__BSS_END__+0x12327928> 100e0: 1a8000ef jal 10288 <efficient_int_mul> 100e4: 00a12623 sw a0,12(sp) 100e8: 00b12423 sw a1,8(sp) 100ec: 0e4000ef jal 101d0 <get_cycles> 100f0: 409505b3 sub a1,a0,s1 100f4: 0001c537 lui a0,0x1c 100f8: c0050513 add a0,a0,-1024 # 1bc00 <__clzsi2+0x70> 100fc: 656000ef jal 10752 <printf> 10100: 0001c537 lui a0,0x1c 10104: 00040593 mv a1,s0 10108: c1450513 add a0,a0,-1004 # 1bc14 <__clzsi2+0x84> 1010c: 646000ef jal 10752 <printf> 10110: 00c12603 lw a2,12(sp) 10114: 00812683 lw a3,8(sp) 10118: 0001c537 lui a0,0x1c 1011c: c2450513 add a0,a0,-988 # 1bc24 <__clzsi2+0x94> 10120: 632000ef jal 10752 <printf> 10124: 01c12083 lw ra,28(sp) 10128: 01812403 lw s0,24(sp) 1012c: 01412483 lw s1,20(sp) 10130: 00000513 li 
a0,0 10134: 02010113 add sp,sp,32 10138: 00008067 ret ``` #### elf size ![](https://hackmd.io/_uploads/Byox-j0Z6.png) #### execute ![](https://hackmd.io/_uploads/Hysk-jCZT.png) - observe - Total cycle count : `530` - Total instruction count : `2c5` - Register use : `$ra, $sp, $a0~$a7, $s0~$s3` ### -Ofast Assembly code ```asm= 00010208 <CLZ_32>: 10208: 00155793 srl a5,a0,0x1 1020c: 00a7e533 or a0,a5,a0 10210: 00255793 srl a5,a0,0x2 10214: 00a7e7b3 or a5,a5,a0 10218: 0047d513 srl a0,a5,0x4 1021c: 00f56533 or a0,a0,a5 10220: 00855713 srl a4,a0,0x8 10224: 00a76733 or a4,a4,a0 10228: 01075793 srl a5,a4,0x10 1022c: 00e7e7b3 or a5,a5,a4 10230: 555556b7 lui a3,0x55555 10234: 0017d713 srl a4,a5,0x1 10238: 55568693 add a3,a3,1365 # 55555555 <__BSS_END__+0x55537805> 1023c: 00d77733 and a4,a4,a3 10240: 40e787b3 sub a5,a5,a4 10244: 333336b7 lui a3,0x33333 10248: 33368693 add a3,a3,819 # 33333333 <__BSS_END__+0x333155e3> 1024c: 0027d713 srl a4,a5,0x2 10250: 00d77733 and a4,a4,a3 10254: 00d7f7b3 and a5,a5,a3 10258: 00f70733 add a4,a4,a5 1025c: 00475793 srl a5,a4,0x4 10260: 0f0f16b7 lui a3,0xf0f1 10264: 00e787b3 add a5,a5,a4 10268: f0f68693 add a3,a3,-241 # f0f0f0f <__BSS_END__+0xf0d31bf> 1026c: 00d7f7b3 and a5,a5,a3 10270: 0087d713 srl a4,a5,0x8 10274: 00f70733 add a4,a4,a5 10278: 01075793 srl a5,a4,0x10 1027c: 00e787b3 add a5,a5,a4 10280: 03f7f793 and a5,a5,63 10284: 02000513 li a0,32 10288: 40f50533 sub a0,a0,a5 1028c: 01051513 sll a0,a0,0x10 10290: 01055513 srl a0,a0,0x10 10294: 00008067 ret 00010298 <efficient_int_mul>: 10298: 00155713 srl a4,a0,0x1 1029c: 0015d793 srl a5,a1,0x1 102a0: 00a76733 or a4,a4,a0 102a4: 00b7e7b3 or a5,a5,a1 102a8: 0027d693 srl a3,a5,0x2 102ac: 00058613 mv a2,a1 102b0: 00275593 srl a1,a4,0x2 102b4: 00b76733 or a4,a4,a1 102b8: 00d7e7b3 or a5,a5,a3 102bc: 00475593 srl a1,a4,0x4 102c0: 0047d693 srl a3,a5,0x4 102c4: 00b76733 or a4,a4,a1 102c8: 00d7e7b3 or a5,a5,a3 102cc: 00875593 srl a1,a4,0x8 102d0: 0087d693 srl a3,a5,0x8 102d4: 00b76733 or a4,a4,a1 
102d8: 00d7e7b3 or a5,a5,a3 102dc: 01075593 srl a1,a4,0x10 102e0: 0107d693 srl a3,a5,0x10 102e4: 00b76733 or a4,a4,a1 102e8: 00d7e7b3 or a5,a5,a3 102ec: 555556b7 lui a3,0x55555 102f0: 0017d593 srl a1,a5,0x1 102f4: 55568693 add a3,a3,1365 # 55555555 <__BSS_END__+0x55537805> 102f8: 00050813 mv a6,a0 102fc: 00175513 srl a0,a4,0x1 10300: 00d57533 and a0,a0,a3 10304: 00d5f6b3 and a3,a1,a3 10308: 40a70733 sub a4,a4,a0 1030c: 40d787b3 sub a5,a5,a3 10310: 33333537 lui a0,0x33333 10314: 33350513 add a0,a0,819 # 33333333 <__BSS_END__+0x333155e3> 10318: 00275593 srl a1,a4,0x2 1031c: 0027d693 srl a3,a5,0x2 10320: 00a77733 and a4,a4,a0 10324: 00a7f7b3 and a5,a5,a0 10328: 00a5f5b3 and a1,a1,a0 1032c: 00a6f6b3 and a3,a3,a0 10330: 00e585b3 add a1,a1,a4 10334: 00f686b3 add a3,a3,a5 10338: 0046d713 srl a4,a3,0x4 1033c: 0045d793 srl a5,a1,0x4 10340: 0f0f1537 lui a0,0xf0f1 10344: f0f50513 add a0,a0,-241 # f0f0f0f <__BSS_END__+0xf0d31bf> 10348: 00b787b3 add a5,a5,a1 1034c: 00d70733 add a4,a4,a3 10350: 00a7f7b3 and a5,a5,a0 10354: 00a77733 and a4,a4,a0 10358: 0087d593 srl a1,a5,0x8 1035c: 00875693 srl a3,a4,0x8 10360: 00b787b3 add a5,a5,a1 10364: 00d70733 add a4,a4,a3 10368: 01075693 srl a3,a4,0x10 1036c: 0107d593 srl a1,a5,0x10 10370: 00d70733 add a4,a4,a3 10374: 00b787b3 add a5,a5,a1 10378: 02000693 li a3,32 1037c: 03f7f793 and a5,a5,63 10380: 03f77713 and a4,a4,63 10384: 40f687b3 sub a5,a3,a5 10388: 40e68733 sub a4,a3,a4 1038c: 01079793 sll a5,a5,0x10 10390: 01071713 sll a4,a4,0x10 10394: 0107d793 srl a5,a5,0x10 10398: 01075713 srl a4,a4,0x10 1039c: 00f77a63 bgeu a4,a5,103b0 <efficient_int_mul+0x118> 103a0: 00060693 mv a3,a2 103a4: 00078713 mv a4,a5 103a8: 00080613 mv a2,a6 103ac: 00068813 mv a6,a3 103b0: 02000893 li a7,32 103b4: 40e888b3 sub a7,a7,a4 103b8: 07105463 blez a7,10420 <efficient_int_mul+0x188> 103bc: 00000793 li a5,0 103c0: 00000513 li a0,0 103c4: 00000593 li a1,0 103c8: 00185e93 srl t4,a6,0x1 103cc: 01f00e13 li t3,31 103d0: 0280006f j 103f8 <efficient_int_mul+0x160> 
103d4: 00d816b3 sll a3,a6,a3 103d8: 00000713 li a4,0 103dc: 00e50733 add a4,a0,a4 103e0: 00a73333 sltu t1,a4,a0 103e4: 00d585b3 add a1,a1,a3 103e8: 00070513 mv a0,a4 103ec: 00b305b3 add a1,t1,a1 103f0: 00178793 add a5,a5,1 103f4: 02f88463 beq a7,a5,1041c <efficient_int_mul+0x184> 103f8: 00f65733 srl a4,a2,a5 103fc: 00177713 and a4,a4,1 10400: fe078693 add a3,a5,-32 10404: fe0706e3 beqz a4,103f0 <efficient_int_mul+0x158> 10408: 40fe0333 sub t1,t3,a5 1040c: fc06d4e3 bgez a3,103d4 <efficient_int_mul+0x13c> 10410: 00f81733 sll a4,a6,a5 10414: 006ed6b3 srl a3,t4,t1 10418: fc5ff06f j 103dc <efficient_int_mul+0x144> 1041c: 00008067 ret 10420: 00000513 li a0,0 10424: 00000593 li a1,0 10428: 00008067 ret 000100b0 <main>: 100b0: fe010113 add sp,sp,-32 100b4: 00112e23 sw ra,28(sp) 100b8: 00812c23 sw s0,24(sp) 100bc: 00912a23 sw s1,20(sp) 100c0: 01212823 sw s2,16(sp) 100c4: 01312623 sw s3,12(sp) 100c8: 12c000ef jal 101f4 <get_instret> 100cc: 00050913 mv s2,a0 100d0: 110000ef jal 101e0 <get_cycles> 100d4: 00050993 mv s3,a0 100d8: ffffe5b7 lui a1,0xffffe 100dc: 12345537 lui a0,0x12345 100e0: ddd58593 add a1,a1,-547 # ffffdddd <__BSS_END__+0xfffe008d> 100e4: 67850513 add a0,a0,1656 # 12345678 <__BSS_END__+0x12327928> 100e8: 1b0000ef jal 10298 <efficient_int_mul> 100ec: 00050493 mv s1,a0 100f0: 00058413 mv s0,a1 100f4: 0ec000ef jal 101e0 <get_cycles> 100f8: 413505b3 sub a1,a0,s3 100fc: 0001c537 lui a0,0x1c 10100: cb850513 add a0,a0,-840 # 1bcb8 <__clzsi2+0x72> 10104: 704000ef jal 10808 <printf> 10108: 0001c537 lui a0,0x1c 1010c: 00090593 mv a1,s2 10110: ccc50513 add a0,a0,-820 # 1bccc <__clzsi2+0x86> 10114: 6f4000ef jal 10808 <printf> 10118: 0001c537 lui a0,0x1c 1011c: 00048613 mv a2,s1 10120: 00040693 mv a3,s0 10124: cdc50513 add a0,a0,-804 # 1bcdc <__clzsi2+0x96> 10128: 6e0000ef jal 10808 <printf> 1012c: 01c12083 lw ra,28(sp) 10130: 01812403 lw s0,24(sp) 10134: 01412483 lw s1,20(sp) 10138: 01012903 lw s2,16(sp) 1013c: 00c12983 lw s3,12(sp) 10140: 00000513 li a0,0 10144: 02010113 
add sp,sp,32 10148: 00008067 ret ``` #### elf size ![](https://hackmd.io/_uploads/SyB0biAba.png) #### execute ![](https://hackmd.io/_uploads/HyCnZjRWp.png) - observe - Total cycle count : `398` - Total instruction count : `2c7` - Register use : `$ra, $sp, $a0~$a7, $s0~$s3, $t1, $t3, $t4` ## Conclusion |-|-O1|-O2|-O3|-Os|-Ofast| |:-:|:-:|:-:|:-:|:-:|:-:| |instret|2c7|2c7|2c7|**2c5**|2c7| |Cycle|**393**|421|398|530|398| |Code lines|132|131|181|**126**|181| - Although the instruction count is identical (`2c7`) for `-O1` and `-Ofast`, the cycle count is slightly lower with `-O1` (393 vs. 398 cycles). - `-Os` incurs the highest cycle count, while `-O1` has the lowest: a decrease from 530 cycles to 393 cycles. - `-O3` and `-Ofast` produce the most lines of code (181 each), while `-Os` produces the fewest (126).