From 83ed3f519de9929b6551b98677047228a8ab4d0c Mon Sep 17 00:00:00 2001 From: Yunsup Lee Date: Mon, 14 Apr 2014 21:15:15 -0700 Subject: [PATCH] commit high-performance mm (scalar and vector versions) --- benchmarks/Makefile | 4 +- benchmarks/common/crt.S | 17 +-- benchmarks/common/syscalls.c | 7 +- benchmarks/common/util.h | 8 +- benchmarks/dgemm/bmark.mk | 30 ---- benchmarks/dgemm/dataset1.h | 97 ------------- benchmarks/dgemm/dgemm_gendata.scala | 37 ----- benchmarks/dgemm/dgemm_main.c | 122 ---------------- benchmarks/mm/bmark.mk | 32 ++++ benchmarks/mm/common.h | 35 +++++ benchmarks/mm/gen.scala | 81 +++++++++++ benchmarks/mm/hwacha.S | 92 ++++++++++++ benchmarks/mm/hwacha.h | 138 ++++++++++++++++++ benchmarks/mm/mm.c | 150 +++++++++++++++++++ benchmarks/mm/mm_main.c | 80 ++++++++++ benchmarks/mm/rb.h | 210 +++++++++++++++++++++++++++ benchmarks/mt-matmul/mt-matmul.c | 15 +- benchmarks/mt-vvadd/mt-vvadd.c | 17 +-- 18 files changed, 847 insertions(+), 325 deletions(-) delete mode 100644 benchmarks/dgemm/bmark.mk delete mode 100644 benchmarks/dgemm/dataset1.h delete mode 100644 benchmarks/dgemm/dgemm_gendata.scala delete mode 100644 benchmarks/dgemm/dgemm_main.c create mode 100644 benchmarks/mm/bmark.mk create mode 100644 benchmarks/mm/common.h create mode 100644 benchmarks/mm/gen.scala create mode 100644 benchmarks/mm/hwacha.S create mode 100644 benchmarks/mm/hwacha.h create mode 100644 benchmarks/mm/mm.c create mode 100644 benchmarks/mm/mm_main.c create mode 100644 benchmarks/mm/rb.h diff --git a/benchmarks/Makefile b/benchmarks/Makefile index 175cf99..241a933 100644 --- a/benchmarks/Makefile +++ b/benchmarks/Makefile @@ -22,7 +22,7 @@ bmarks = \ towers \ vvadd \ multiply \ - dgemm \ + mm \ dhrystone \ spmv \ mt-vvadd \ @@ -38,7 +38,7 @@ bmarks_host = \ towers \ vvadd \ multiply \ - dgemm \ + mm \ spmv \ vec-vvadd \ vec-cmplxmult \ diff --git a/benchmarks/common/crt.S b/benchmarks/common/crt.S index b273900..82cad93 100644 --- a/benchmarks/common/crt.S +++ b/benchmarks/common/crt.S @@ -8,14 +8,6 @@ # define SREG sw #endif - .data - .globl _heapend - .globl environ -_heapend: - .word 0 -environ: - .word 0 - .text .globl _start @@ -62,10 +54,13 @@ _start: li a0, SR_EF | SR_EA csrs status, a0 - ## if that didn't stick, we don't have an FPU, so don't initialize it csrr t0, status - and t0, t0, SR_EF - beqz t0, 1f + and t1, t0, SR_EA + sw t1, have_vec, t2 + + ## if that didn't stick, we don't have an FPU, so don't initialize it + and t1, t0, SR_EF + beqz t1, 1f fssr x0 fmv.s.x f0, x0 diff --git a/benchmarks/common/syscalls.c b/benchmarks/common/syscalls.c index a168ebf..e9f04e2 100644 --- a/benchmarks/common/syscalls.c +++ b/benchmarks/common/syscalls.c @@ -8,6 +8,9 @@ #define SYS_stats 1234 +// initialized in crt.S +int have_vec; + static long handle_frontend_syscall(long which, long arg0, long arg1, long arg2) { volatile uint64_t magic_mem[8] __attribute__((aligned(64))); @@ -150,8 +153,8 @@ void _init(int cid, int nc) #undef putchar int putchar(int ch) { - static char buf[64] __attribute__((aligned(64))); - static int buflen = 0; + static __thread char buf[64] __attribute__((aligned(64))); + static __thread int buflen = 0; buf[buflen++] = ch; diff --git a/benchmarks/common/util.h b/benchmarks/common/util.h index 7c2a048..6c4f963 100644 --- a/benchmarks/common/util.h +++ b/benchmarks/common/util.h @@ -31,6 +31,8 @@ static void setStats(int enable) {} extern void setStats(int enable); #endif +extern int have_vec; + #define static_assert(cond) switch(0) { case 0: case !!(long)(cond): ; } static void printArray(const char name[], int n, const int arr[]) @@ -87,11 +89,7 @@ static int verifyDouble(int n, const volatile double* test, const double* verify return 0; } -#ifndef ncores -#define ncores 1 -#endif - -static void __attribute__((noinline)) barrier() +static void __attribute__((noinline)) barrier(int ncores) { static volatile int sense; static volatile int count; diff --git a/benchmarks/dgemm/bmark.mk b/benchmarks/dgemm/bmark.mk deleted file mode 100644 index 11d8656..0000000 --- a/benchmarks/dgemm/bmark.mk +++ /dev/null @@ -1,30 +0,0 @@ -#======================================================================= -# UCB CS250 Makefile fragment for benchmarks -#----------------------------------------------------------------------- -# -# Each benchmark directory should have its own fragment which -# essentially lists what the source files are and how to link them -# into an riscv and/or host executable. All variables should include -# the benchmark name as a prefix so that they are unique. -# - -dgemm_c_src = \ - dgemm_main.c \ - syscalls.c \ - -dgemm_riscv_src = \ - crt.S \ - -dgemm_c_objs = $(patsubst %.c, %.o, $(dgemm_c_src)) -dgemm_riscv_objs = $(patsubst %.S, %.o, $(dgemm_riscv_src)) - -dgemm_host_bin = dgemm.host -$(dgemm_host_bin) : $(dgemm_c_src) - $(HOST_COMP) $^ -o $(dgemm_host_bin) - -dgemm_riscv_bin = dgemm.riscv -$(dgemm_riscv_bin) : $(dgemm_c_objs) $(dgemm_riscv_objs) - $(RISCV_LINK) $(dgemm_c_objs) $(dgemm_riscv_objs) -o $(dgemm_riscv_bin) $(RISCV_LINK_OPTS) - -junk += $(dgemm_c_objs) $(dgemm_riscv_objs) \ - $(dgemm_host_bin) $(dgemm_riscv_bin) diff --git a/benchmarks/dgemm/dataset1.h b/benchmarks/dgemm/dataset1.h deleted file mode 100644 index 9db066e..0000000 --- a/benchmarks/dgemm/dataset1.h +++ /dev/null @@ -1,97 +0,0 @@ -#define DATA_SIZE 30 -const double input1_data[DATA_SIZE*DATA_SIZE] = { -745.0, 504.0, 772.0, 818.0, 443.0, 308.0, 823.0, 523.0, 93.0, 379.0, 728.0, 543.0, 40.0, 482.0, 728.0, 487.0, 144.0, 486.0, 109.0, 994.0, 373.0, 257.0, 196.0, 145.0, 234.0, 301.0, 639.0, 379.0, 913.0, 420.0, -264.0, 636.0, 873.0, 677.0, 330.0, 928.0, 30.0, 603.0, 96.0, 510.0, 196.0, 55.0, 702.0, 663.0, 151.0, 526.0, 624.0, 598.0, 529.0, 926.0, 914.0, 641.0, 401.0, 146.0, 756.0, 550.0, 92.0, 452.0, 786.0, 417.0, -115.0, 202.0, 806.0, 841.0, 657.0, 33.0, 66.0, 595.0, 751.0, 302.0, 70.0, 453.0, 318.0, 580.0, 114.0, 85.0, 585.0, 959.0, 393.0, 810.0, 276.0, 839.0, 58.0, 632.0, 940.0, 568.0, 676.0, 625.0, 861.0, 702.0, -413.0, 79.0, 762.0, 494.0, 695.0, 774.0, 877.0, 968.0, 327.0, 742.0, 163.0, 353.0, 692.0, 870.0, 634.0, 60.0, 545.0, 300.0, 625.0, 48.0, 390.0, 713.0, 661.0, 613.0, 673.0, 89.0, 116.0, 472.0, 837.0, 864.0, -256.0, 542.0, 660.0, 768.0, 474.0, 229.0, 783.0, 583.0, 975.0, 278.0, 838.0, 557.0, 372.0, 815.0, 94.0, 820.0, 713.0, 685.0, 606.0, 304.0, 549.0, 150.0, 237.0, 981.0, 111.0, 85.0, 741.0, 960.0, 499.0, 110.0, -540.0, 414.0, 153.0, 809.0, 477.0, 176.0, 46.0, 27.0, 106.0, 704.0, 709.0, 728.0, 355.0, 934.0, 38.0, 974.0, 744.0, 651.0, 169.0, 514.0, 550.0, 742.0, 456.0, 453.0, 106.0, 956.0, 374.0, 945.0, 688.0, 594.0, -983.0, 686.0, 86.0, 247.0, 389.0, 914.0, 378.0, 837.0, 556.0, 332.0, 884.0, 102.0, 651.0, 329.0, 305.0, 874.0, 863.0, 752.0, 94.0, 102.0, 878.0, 200.0, 645.0, 601.0, 573.0, 369.0, 247.0, 241.0, 158.0, 647.0, -166.0, 139.0, 810.0, 531.0, 118.0, 750.0, 759.0, 621.0, 87.0, 472.0, 846.0, 644.0, 209.0, 515.0, 172.0, 565.0, 685.0, 344.0, 850.0, 218.0, 788.0, 323.0, 867.0, 809.0, 991.0, 806.0, 617.0, 878.0, 937.0, 816.0, -517.0, 811.0, 181.0, 590.0, 705.0, 691.0, 847.0, 233.0, 652.0, 374.0, 570.0, 160.0, 873.0, 78.0, 246.0, 652.0, 876.0, 145.0, 587.0, 729.0, 467.0, 111.0, 590.0, 653.0, 972.0, 987.0, 231.0, 809.0, 456.0, 887.0, -861.0, 60.0, 588.0, 71.0, 519.0, 479.0, 640.0, 608.0, 336.0, 259.0, 7.0, 578.0, 53.0, 823.0, 305.0, 911.0, 230.0, 445.0, 216.0, 696.0, 278.0, 804.0, 413.0, 333.0, 409.0, 632.0, 86.0, 401.0, 226.0, 93.0, -815.0, 177.0, 894.0, 51.0, 441.0, 785.0, 888.0, 915.0, 347.0, 55.0, 762.0, 896.0, 964.0, 539.0, 572.0, 889.0, 275.0, 43.0, 220.0, 195.0, 963.0, 342.0, 915.0, 651.0, 750.0, 286.0, 632.0, 168.0, 652.0, 880.0, -803.0, 439.0, 112.0, 544.0, 624.0, 656.0, 679.0, 117.0, 413.0, 798.0, 230.0, 571.0, 106.0, 36.0, 656.0, 848.0, 733.0, 931.0, 513.0, 614.0, 302.0, 776.0, 401.0, 703.0, 510.0, 682.0, 280.0, 351.0, 79.0, 353.0, -106.0, 355.0, 343.0, 802.0, 232.0, 583.0, 103.0, 663.0, 683.0, 37.0, 130.0, 795.0, 261.0, 202.0, 949.0, 739.0, 926.0, 930.0, 522.0, 872.0, 567.0, 724.0, 0.0, 385.0, 191.0, 704.0, 586.0, 974.0, 944.0, 100.0, -506.0, 903.0, 325.0, 622.0, 218.0, 842.0, 298.0, 676.0, 503.0, 4.0, 784.0, 63.0, 195.0, 495.0, 740.0, 518.0, 845.0, 649.0, 730.0, 287.0, 231.0, 477.0, 939.0, 472.0, 324.0, 459.0, 784.0, 988.0, 572.0, 338.0, -114.0, 993.0, 965.0, 134.0, 608.0, 613.0, 527.0, 645.0, 324.0, 497.0, 949.0, 659.0, 555.0, 494.0, 91.0, 317.0, 222.0, 276.0, 232.0, 266.0, 985.0, 146.0, 713.0, 54.0, 592.0, 879.0, 799.0, 16.0, 273.0, 791.0, -47.0, 235.0, 306.0, 873.0, 829.0, 266.0, 672.0, 893.0, 396.0, 396.0, 809.0, 603.0, 307.0, 712.0, 551.0, 390.0, 551.0, 898.0, 777.0, 672.0, 385.0, 810.0, 39.0, 311.0, 868.0, 734.0, 554.0, 941.0, 414.0, 182.0, -549.0, 589.0, 24.0, 141.0, 332.0, 833.0, 76.0, 431.0, 528.0, 313.0, 690.0, 190.0, 860.0, 670.0, 991.0, 679.0, 0.0, 460.0, 122.0, 585.0, 647.0, 157.0, 260.0, 941.0, 943.0, 27.0, 820.0, 81.0, 612.0, 622.0, -252.0, 183.0, 673.0, 275.0, 27.0, 867.0, 36.0, 45.0, 80.0, 321.0, 373.0, 485.0, 232.0, 428.0, 379.0, 973.0, 532.0, 804.0, 763.0, 91.0, 802.0, 463.0, 190.0, 153.0, 910.0, 552.0, 885.0, 976.0, 84.0, 572.0, -83.0, 109.0, 349.0, 881.0, 368.0, 980.0, 316.0, 97.0, 654.0, 737.0, 652.0, 525.0, 714.0, 526.0, 608.0, 245.0, 296.0, 765.0, 222.0, 403.0, 419.0, 663.0, 256.0, 23.0, 144.0, 446.0, 905.0, 933.0, 238.0, 709.0, -17.0, 587.0, 508.0, 879.0, 525.0, 310.0, 486.0, 372.0, 742.0, 764.0, 462.0, 8.0, 108.0, 741.0, 803.0, 502.0, 422.0, 579.0, 993.0, 835.0, 953.0, 584.0, 92.0, 932.0, 579.0, 534.0, 602.0, 473.0, 99.0, 961.0, -413.0, 83.0, 255.0, 849.0, 953.0, 912.0, 552.0, 220.0, 968.0, 600.0, 918.0, 48.0, 863.0, 88.0, 132.0, 278.0, 957.0, 507.0, 320.0, 618.0, 768.0, 776.0, 401.0, 943.0, 995.0, 695.0, 869.0, 295.0, 960.0, 357.0, -446.0, 304.0, 926.0, 664.0, 785.0, 211.0, 287.0, 420.0, 187.0, 325.0, 49.0, 355.0, 607.0, 752.0, 783.0, 963.0, 952.0, 188.0, 939.0, 953.0, 384.0, 255.0, 674.0, 747.0, 243.0, 892.0, 60.0, 3.0, 643.0, 831.0, -734.0, 240.0, 944.0, 943.0, 382.0, 671.0, 12.0, 292.0, 18.0, 966.0, 296.0, 335.0, 206.0, 881.0, 22.0, 473.0, 888.0, 510.0, 725.0, 276.0, 769.0, 506.0, 202.0, 789.0, 515.0, 110.0, 658.0, 835.0, 345.0, 885.0, -776.0, 194.0, 8.0, 124.0, 908.0, 176.0, 144.0, 820.0, 803.0, 56.0, 270.0, 676.0, 741.0, 621.0, 290.0, 214.0, 511.0, 382.0, 431.0, 164.0, 173.0, 504.0, 116.0, 828.0, 68.0, 465.0, 263.0, 383.0, 793.0, 32.0, -313.0, 39.0, 34.0, 331.0, 874.0, 310.0, 607.0, 128.0, 505.0, 948.0, 827.0, 357.0, 31.0, 904.0, 165.0, 491.0, 809.0, 607.0, 390.0, 156.0, 441.0, 219.0, 198.0, 165.0, 592.0, 890.0, 192.0, 791.0, 234.0, 424.0, -886.0, 552.0, 965.0, 615.0, 370.0, 30.0, 79.0, 178.0, 67.0, 148.0, 20.0, 241.0, 928.0, 301.0, 73.0, 55.0, 428.0, 812.0, 752.0, 535.0, 110.0, 518.0, 584.0, 661.0, 35.0, 856.0, 279.0, 633.0, 354.0, 450.0, -327.0, 165.0, 508.0, 261.0, 763.0, 496.0, 415.0, 872.0, 557.0, 428.0, 110.0, 406.0, 341.0, 425.0, 326.0, 644.0, 904.0, 676.0, 542.0, 590.0, 738.0, 651.0, 980.0, 521.0, 865.0, 511.0, 920.0, 563.0, 448.0, 780.0, -920.0, 999.0, 28.0, 940.0, 120.0, 908.0, 167.0, 319.0, 891.0, 53.0, 158.0, 319.0, 202.0, 283.0, 793.0, 608.0, 320.0, 711.0, 447.0, 533.0, 528.0, 300.0, 532.0, 797.0, 571.0, 960.0, 104.0, 773.0, 122.0, 99.0, -156.0, 119.0, 932.0, 689.0, 227.0, 991.0, 396.0, 890.0, 579.0, 54.0, 459.0, 624.0, 976.0, 904.0, 781.0, 712.0, 403.0, 560.0, 226.0, 225.0, 940.0, 68.0, 140.0, 714.0, 937.0, 731.0, 624.0, 416.0, 601.0, 50.0, -883.0, 869.0, 921.0, 350.0, 226.0, 596.0, 698.0, 51.0, 510.0, 865.0, 942.0, 971.0, 794.0, 254.0, 757.0, 19.0, 690.0, 303.0, 664.0, 127.0, 538.0, 487.0, 609.0, 86.0, 688.0, 76.0, 477.0, 664.0, 343.0, 613.0 -}; -const double input2_data[DATA_SIZE*DATA_SIZE] = { -494.0, 929.0, 783.0, 86.0, 317.0, 957.0, 289.0, 481.0, 222.0, 945.0, 97.0, 952.0, 1.0, 834.0, 795.0, 59.0, 127.0, 10.0, 399.0, 904.0, 907.0, 665.0, 623.0, 841.0, 190.0, 903.0, 698.0, 132.0, 775.0, 911.0, -346.0, 417.0, 995.0, 439.0, 159.0, 390.0, 902.0, 917.0, 174.0, 462.0, 638.0, 484.0, 505.0, 353.0, 943.0, 724.0, 684.0, 845.0, 753.0, 983.0, 353.0, 730.0, 883.0, 679.0, 449.0, 442.0, 242.0, 117.0, 64.0, 143.0, -42.0, 476.0, 395.0, 343.0, 983.0, 724.0, 153.0, 3.0, 88.0, 131.0, 475.0, 517.0, 297.0, 917.0, 522.0, 776.0, 796.0, 468.0, 11.0, 9.0, 788.0, 908.0, 161.0, 554.0, 429.0, 381.0, 495.0, 720.0, 795.0, 563.0, -976.0, 786.0, 349.0, 930.0, 421.0, 218.0, 589.0, 210.0, 18.0, 536.0, 888.0, 18.0, 142.0, 674.0, 684.0, 297.0, 190.0, 160.0, 195.0, 232.0, 917.0, 451.0, 955.0, 340.0, 553.0, 470.0, 287.0, 263.0, 164.0, 28.0, -133.0, 206.0, 141.0, 697.0, 860.0, 45.0, 508.0, 61.0, 442.0, 142.0, 991.0, 523.0, 720.0, 544.0, 787.0, 231.0, 635.0, 828.0, 162.0, 150.0, 656.0, 915.0, 557.0, 56.0, 539.0, 108.0, 862.0, 160.0, 739.0, 561.0, -525.0, 564.0, 334.0, 977.0, 789.0, 287.0, 316.0, 169.0, 18.0, 140.0, 322.0, 707.0, 458.0, 646.0, 902.0, 728.0, 877.0, 386.0, 922.0, 657.0, 490.0, 942.0, 127.0, 617.0, 143.0, 979.0, 911.0, 71.0, 348.0, 989.0, -55.0, 968.0, 173.0, 987.0, 959.0, 865.0, 850.0, 194.0, 776.0, 681.0, 893.0, 564.0, 305.0, 645.0, 600.0, 214.0, 613.0, 790.0, 496.0, 127.0, 564.0, 197.0, 818.0, 419.0, 597.0, 414.0, 552.0, 240.0, 517.0, 34.0, -487.0, 497.0, 614.0, 182.0, 761.0, 45.0, 959.0, 441.0, 90.0, 500.0, 76.0, 435.0, 905.0, 920.0, 402.0, 641.0, 621.0, 526.0, 521.0, 438.0, 457.0, 224.0, 718.0, 611.0, 909.0, 347.0, 524.0, 158.0, 491.0, 331.0, -727.0, 536.0, 243.0, 503.0, 252.0, 633.0, 675.0, 671.0, 600.0, 598.0, 479.0, 522.0, 92.0, 778.0, 793.0, 956.0, 562.0, 659.0, 29.0, 132.0, 182.0, 491.0, 107.0, 891.0, 108.0, 664.0, 985.0, 608.0, 610.0, 589.0, -364.0, 665.0, 689.0, 262.0, 356.0, 324.0, 646.0, 275.0, 906.0, 316.0, 551.0, 836.0, 14.0, 495.0, 900.0, 859.0, 529.0, 900.0, 516.0, 654.0, 635.0, 90.0, 337.0, 59.0, 954.0, 419.0, 921.0, 903.0, 393.0, 503.0, -458.0, 879.0, 568.0, 925.0, 809.0, 496.0, 22.0, 780.0, 966.0, 265.0, 759.0, 872.0, 352.0, 110.0, 467.0, 641.0, 321.0, 272.0, 646.0, 765.0, 413.0, 935.0, 309.0, 661.0, 796.0, 881.0, 283.0, 612.0, 602.0, 684.0, -680.0, 332.0, 688.0, 896.0, 483.0, 131.0, 680.0, 265.0, 828.0, 779.0, 410.0, 176.0, 657.0, 776.0, 678.0, 232.0, 522.0, 722.0, 485.0, 878.0, 987.0, 754.0, 225.0, 897.0, 374.0, 704.0, 117.0, 929.0, 586.0, 891.0, -162.0, 237.0, 300.0, 147.0, 648.0, 324.0, 376.0, 552.0, 381.0, 589.0, 788.0, 637.0, 461.0, 802.0, 481.0, 694.0, 406.0, 127.0, 623.0, 743.0, 662.0, 257.0, 636.0, 655.0, 827.0, 649.0, 961.0, 561.0, 511.0, 991.0, -633.0, 907.0, 279.0, 732.0, 527.0, 31.0, 54.0, 320.0, 875.0, 863.0, 82.0, 881.0, 272.0, 677.0, 787.0, 149.0, 709.0, 457.0, 411.0, 287.0, 598.0, 27.0, 231.0, 853.0, 270.0, 520.0, 96.0, 178.0, 51.0, 938.0, -414.0, 639.0, 127.0, 888.0, 18.0, 821.0, 606.0, 253.0, 566.0, 134.0, 833.0, 324.0, 316.0, 393.0, 157.0, 753.0, 888.0, 131.0, 40.0, 450.0, 762.0, 139.0, 603.0, 196.0, 324.0, 590.0, 573.0, 738.0, 252.0, 297.0, -353.0, 280.0, 718.0, 350.0, 813.0, 663.0, 2.0, 405.0, 405.0, 353.0, 957.0, 699.0, 981.0, 499.0, 807.0, 648.0, 0.0, 675.0, 8.0, 514.0, 215.0, 22.0, 463.0, 671.0, 65.0, 764.0, 25.0, 232.0, 891.0, 152.0, -698.0, 426.0, 813.0, 87.0, 12.0, 246.0, 1.0, 834.0, 81.0, 695.0, 296.0, 935.0, 30.0, 699.0, 811.0, 881.0, 822.0, 259.0, 883.0, 935.0, 447.0, 973.0, 629.0, 58.0, 534.0, 745.0, 654.0, 344.0, 826.0, 568.0, -198.0, 814.0, 111.0, 912.0, 84.0, 179.0, 706.0, 700.0, 838.0, 455.0, 972.0, 216.0, 659.0, 210.0, 781.0, 842.0, 248.0, 465.0, 518.0, 37.0, 3.0, 803.0, 562.0, 983.0, 375.0, 34.0, 882.0, 586.0, 668.0, 228.0, -644.0, 632.0, 530.0, 729.0, 783.0, 543.0, 450.0, 151.0, 302.0, 148.0, 225.0, 794.0, 590.0, 212.0, 146.0, 191.0, 718.0, 621.0, 303.0, 343.0, 562.0, 715.0, 516.0, 409.0, 36.0, 486.0, 248.0, 416.0, 624.0, 865.0, -680.0, 460.0, 737.0, 813.0, 658.0, 396.0, 749.0, 132.0, 875.0, 73.0, 243.0, 874.0, 309.0, 5.0, 748.0, 526.0, 273.0, 634.0, 661.0, 382.0, 61.0, 728.0, 511.0, 960.0, 418.0, 454.0, 249.0, 725.0, 68.0, 306.0, -899.0, 127.0, 599.0, 587.0, 7.0, 236.0, 722.0, 531.0, 694.0, 403.0, 163.0, 483.0, 557.0, 10.0, 855.0, 494.0, 90.0, 186.0, 180.0, 335.0, 646.0, 74.0, 869.0, 751.0, 409.0, 777.0, 449.0, 310.0, 989.0, 624.0, -672.0, 27.0, 968.0, 211.0, 459.0, 70.0, 617.0, 431.0, 983.0, 291.0, 878.0, 599.0, 69.0, 873.0, 904.0, 337.0, 250.0, 120.0, 20.0, 389.0, 254.0, 714.0, 478.0, 62.0, 931.0, 992.0, 324.0, 537.0, 84.0, 333.0, -724.0, 874.0, 511.0, 407.0, 915.0, 566.0, 809.0, 905.0, 811.0, 291.0, 572.0, 374.0, 376.0, 988.0, 751.0, 881.0, 888.0, 169.0, 922.0, 616.0, 181.0, 203.0, 389.0, 622.0, 568.0, 204.0, 924.0, 543.0, 331.0, 765.0, -273.0, 742.0, 682.0, 913.0, 38.0, 787.0, 983.0, 783.0, 821.0, 848.0, 114.0, 661.0, 678.0, 634.0, 865.0, 551.0, 700.0, 253.0, 481.0, 231.0, 860.0, 680.0, 801.0, 783.0, 987.0, 624.0, 888.0, 770.0, 11.0, 689.0, -731.0, 72.0, 538.0, 78.0, 324.0, 368.0, 176.0, 305.0, 817.0, 136.0, 122.0, 783.0, 608.0, 391.0, 136.0, 859.0, 13.0, 452.0, 663.0, 826.0, 880.0, 728.0, 947.0, 342.0, 646.0, 182.0, 13.0, 148.0, 450.0, 610.0, -477.0, 121.0, 487.0, 143.0, 125.0, 153.0, 268.0, 635.0, 745.0, 792.0, 981.0, 109.0, 694.0, 353.0, 547.0, 559.0, 974.0, 59.0, 655.0, 694.0, 165.0, 628.0, 544.0, 694.0, 476.0, 702.0, 494.0, 627.0, 88.0, 630.0, -194.0, 650.0, 165.0, 587.0, 884.0, 53.0, 44.0, 931.0, 432.0, 630.0, 214.0, 817.0, 383.0, 3.0, 908.0, 667.0, 205.0, 791.0, 845.0, 584.0, 396.0, 26.0, 547.0, 399.0, 189.0, 438.0, 698.0, 826.0, 734.0, 322.0, -678.0, 305.0, 868.0, 564.0, 480.0, 499.0, 182.0, 215.0, 276.0, 272.0, 803.0, 21.0, 458.0, 2.0, 90.0, 213.0, 223.0, 97.0, 68.0, 589.0, 408.0, 197.0, 472.0, 915.0, 771.0, 830.0, 907.0, 133.0, 146.0, 848.0, -89.0, 17.0, 580.0, 755.0, 909.0, 310.0, 249.0, 606.0, 544.0, 193.0, 409.0, 235.0, 537.0, 580.0, 850.0, 838.0, 882.0, 416.0, 365.0, 263.0, 331.0, 868.0, 68.0, 519.0, 206.0, 70.0, 530.0, 927.0, 590.0, 29.0, -990.0, 507.0, 164.0, 89.0, 550.0, 150.0, 959.0, 787.0, 444.0, 695.0, 79.0, 86.0, 359.0, 841.0, 991.0, 71.0, 537.0, 521.0, 253.0, 88.0, 872.0, 760.0, 71.0, 959.0, 844.0, 24.0, 323.0, 86.0, 284.0, 324.0 -}; -const double verify_data[DATA_SIZE*DATA_SIZE] = { -6411903.0, 7533490.0, 6901412.0, 8515176.0, 7937886.0, 5660376.0, 6684238.0, 5930516.0, 7411434.0, 5963715.0, 7339541.0, 7325224.0, 5773363.0, 6987016.0, 9472107.0, 7333522.0, 6868741.0, 6373927.0, 5818371.0, 6232365.0, 7410967.0, 7456844.0, 7008217.0, 8317827.0, 6560681.0, 6993670.0, 6792972.0, 6664526.0, 6449148.0, 6153016.0, -7767492.0, 6619373.0, 8084203.0, 7609675.0, 7767288.0, 5081659.0, 6915392.0, 6473820.0, 7434817.0, 5832034.0, 7317348.0, 8206514.0, 6552649.0, 7813213.0, 1.0197744E7, 8753506.0, 7589899.0, 6153453.0, 6716065.0, 7097060.0, 7488858.0, 8435675.0, 7671781.0, 8759777.0, 7413370.0, 7802106.0, 7613653.0, 6343832.0, 6741596.0, 7885307.0, -7584476.0, 6627809.0, 7334380.0, 7777143.0, 7311439.0, 4673418.0, 7231791.0, 6792764.0, 8133419.0, 6543855.0, 7278453.0, 7372708.0, 6345899.0, 7821711.0, 1.0014052E7, 8297702.0, 7222827.0, 6550430.0, 5927298.0, 5988751.0, 7626028.0, 8765186.0, 7449916.0, 8719206.0, 7751464.0, 6820384.0, 7786274.0, 7199576.0, 6446456.0, 7164579.0, -7690319.0, 8107202.0, 7462090.0, 8353962.0, 8807718.0, 5963658.0, 8163425.0, 6652851.0, 8169066.0, 6906018.0, 7448498.0, 8403353.0, 6499220.0, 9827668.0, 1.0370807E7, 8413234.0, 8979741.0, 6743719.0, 6583108.0, 6733404.0, 9203350.0, 8305284.0, 7661872.0, 8475391.0, 8638725.0, 7679995.0, 8961802.0, 6750314.0, 7049775.0, 8577580.0, -7412324.0, 8652125.0, 7961811.0, 9598288.0, 8296849.0, 6391772.0, 7240482.0, 7531594.0, 8110724.0, 7533868.0, 8155571.0, 8596878.0, 6925515.0, 7915918.0, 1.0572044E7, 8603488.0, 7693006.0, 7286678.0, 6399408.0, 6860014.0, 8027549.0, 7922416.0, 8053228.0, 9735395.0, 7584369.0, 8588126.0, 8653757.0, 7169521.0, 7682896.0, 8137235.0, -7988409.0, 7238778.0, 8449952.0, 7992084.0, 7262269.0, 5011760.0, 6432958.0, 7422948.0, 8648278.0, 7337408.0, 8462957.0, 7677222.0, 6397788.0, 7568297.0, 1.0797395E7, 7705063.0, 7146568.0, 6022723.0, 6370298.0, 7586299.0, 7452484.0, 7887693.0, 7307727.0, 9197385.0, 7726968.0, 8478492.0, 7601569.0, 6960883.0, 6522251.0, 7976344.0, -7572166.0, 7897218.0, 7706123.0, 7359858.0, 6988842.0, 5975477.0, 7234281.0, 8027665.0, 7471601.0, 7160694.0, 7249166.0, 8618544.0, 6644699.0, 8171915.0, 1.0497206E7, 8729802.0, 7174778.0, 5972218.0, 7216469.0, 7843533.0, 7628960.0, 7931021.0, 7962767.0, 9166492.0, 7547189.0, 8369468.0, 8488684.0, 5609773.0, 7562941.0, 8204473.0, -8857954.0, 8551958.0, 8909718.0, 9431684.0, 9556061.0, 6617423.0, 7827032.0, 8179745.0, 9353072.0, 7550842.0, 8121182.0, 8962388.0, 7937096.0, 8954704.0, 1.1085794E7, 9549551.0, 9145331.0, 7163225.0, 7988163.0, 8211966.0, 9352371.0, 9330003.0, 8532410.0, 1.0197366E7, 8931427.0, 8915845.0, 8823015.0, 7853048.0, 8044845.0, 9386048.0, -8545963.0, 7912549.0, 8606518.0, 8379764.0, 8351466.0, 6814749.0, 8057369.0, 8048780.0, 8451865.0, 7466527.0, 8749940.0, 8919463.0, 7276916.0, 8536482.0, 1.0911699E7, 9197448.0, 8560655.0, 7064115.0, 7860209.0, 8511883.0, 8654584.0, 9323289.0, 8981720.0, 9803015.0, 8598113.0, 8780796.0, 9135261.0, 6665333.0, 7324434.0, 8872122.0, -5946430.0, 6135788.0, 6591364.0, 6433251.0, 6632373.0, 4985963.0, 5769881.0, 4843727.0, 7021916.0, 5697194.0, 6385378.0, 6994046.0, 5502157.0, 7123974.0, 8365161.0, 6212258.0, 6104723.0, 5313031.0, 4845223.0, 5646055.0, 6129753.0, 6289476.0, 6051385.0, 7494259.0, 5723628.0, 6965055.0, 6172636.0, 5253106.0, 5610430.0, 6685980.0, -8055844.0, 8469174.0, 8170470.0, 8793113.0, 9859241.0, 7007325.0, 8137618.0, 8029717.0, 8997562.0, 7852580.0, 7740885.0, 9375326.0, 7883965.0, 1.0029182E7, 1.1346371E7, 9215599.0, 8636081.0, 7104102.0, 7496227.0, 8176023.0, 9732692.0, 8508678.0, 8257884.0, 1.0379885E7, 8448925.0, 9060668.0, 8836821.0, 7441640.0, 8726147.0, 9513821.0, -7367171.0, 7678838.0, 7804927.0, 8158615.0, 6673882.0, 6127379.0, 7575878.0, 6722516.0, 8300529.0, 6711008.0, 8389597.0, 8171405.0, 6196454.0, 7704372.0, 1.029698E7, 8112392.0, 7236981.0, 6508313.0, 6457042.0, 7280014.0, 7526900.0, 8231928.0, 7990796.0, 8184970.0, 7283106.0, 8214745.0, 8201658.0, 6708897.0, 6728783.0, 7449993.0, -7865362.0, 6776674.0, 8132061.0, 9041300.0, 7066772.0, 5445850.0, 7151522.0, 6942482.0, 7679044.0, 6487802.0, 8458847.0, 7314760.0, 6955102.0, 7314651.0, 9998347.0, 8897441.0, 7785211.0, 6248190.0, 6094040.0, 7118063.0, 7145775.0, 8373612.0, 7778499.0, 9104195.0, 6890059.0, 8603542.0, 8143136.0, 7583203.0, 6981314.0, 7304046.0, -7986730.0, 8495508.0, 8226132.0, 8837284.0, 8110164.0, 6270243.0, 7155084.0, 8144444.0, 7505249.0, 6776566.0, 8180893.0, 8341901.0, 6720764.0, 7941827.0, 1.0232075E7, 9099507.0, 8475362.0, 6245408.0, 7407245.0, 7953062.0, 7429359.0, 8415664.0, 8058703.0, 9250511.0, 7408836.0, 8633240.0, 8794298.0, 6612461.0, 6871055.0, 8223063.0, -7048151.0, 7129340.0, 7082981.0, 7489318.0, 8235538.0, 4945598.0, 7158417.0, 7404589.0, 7952446.0, 6524416.0, 7264108.0, 7881673.0, 6788833.0, 7675645.0, 1.0238341E7, 8484876.0, 7867557.0, 7037936.0, 7237519.0, 7346638.0, 7710289.0, 7883823.0, 7209172.0, 8844511.0, 7452039.0, 7340367.0, 7433870.0, 6571140.0, 7229344.0, 7904304.0, -8240448.0, 7954564.0, 8154706.0, 9504953.0, 8400499.0, 5460298.0, 7584126.0, 6871697.0, 9185194.0, 6905207.0, 9141475.0, 8494493.0, 7418418.0, 7753101.0, 1.0000234E7, 8633021.0, 7846515.0, 7199308.0, 6775263.0, 7435406.0, 8253684.0, 8758868.0, 8831361.0, 9095568.0, 8526076.0, 8528570.0, 8182811.0, 7299290.0, 7191919.0, 8226562.0, -6751777.0, 7186625.0, 6646412.0, 8018233.0, 6909389.0, 5658036.0, 6840978.0, 7190409.0, 7959774.0, 6192193.0, 6295944.0, 8365353.0, 6495593.0, 6989066.0, 9678769.0, 8422620.0, 6708233.0, 6122942.0, 6541236.0, 6976638.0, 7715128.0, 7027844.0, 7287427.0, 8765201.0, 6824683.0, 7411205.0, 7587931.0, 6447872.0, 6342510.0, 7582018.0, -7082585.0, 6197492.0, 6807726.0, 6956058.0, 6641216.0, 4830697.0, 5048238.0, 6053977.0, 6808066.0, 5552500.0, 6709804.0, 7175350.0, 6209976.0, 5950372.0, 8477306.0, 7412364.0, 5897445.0, 5540244.0, 5666371.0, 6696737.0, 6961106.0, 6862383.0, 6637593.0, 8019798.0, 5997948.0, 7597047.0, 6638311.0, 5646733.0, 6753680.0, 7600719.0, -7405626.0, 7342316.0, 6412193.0, 8347922.0, 7388131.0, 4709450.0, 6484686.0, 6422277.0, 7614878.0, 6155351.0, 8101524.0, 7056940.0, 5444306.0, 7010298.0, 9663947.0, 7786125.0, 6890705.0, 6006121.0, 6032720.0, 6508389.0, 7243995.0, 7137452.0, 6513236.0, 8338516.0, 7172679.0, 7934665.0, 8344129.0, 6707389.0, 6191989.0, 7659162.0, -8824351.0, 8499764.0, 7935349.0, 9362467.0, 7540129.0, 6218015.0, 8666871.0, 7431305.0, 9119473.0, 7093927.0, 8066413.0, 8889088.0, 6865107.0, 7703744.0, 1.1009442E7, 8749305.0, 8122317.0, 7493355.0, 6353037.0, 6661887.0, 8703986.0, 8305822.0, 8770358.0, 9415405.0, 8359360.0, 8392544.0, 8323587.0, 7316703.0, 6900664.0, 7998790.0, -8559320.0, 8232032.0, 8582628.0, 9561300.0, 8909146.0, 6399353.0, 7974575.0, 8670453.0, 9676934.0, 7602031.0, 9078286.0, 1.0045779E7, 7111219.0, 8996593.0, 1.226795E7, 1.0495147E7, 8627020.0, 7247946.0, 8214785.0, 8248879.0, 8942703.0, 1.0207892E7, 9159335.0, 9386478.0, 9122587.0, 9412550.0, 1.0407378E7, 8320858.0, 8412198.0, 9183005.0, -7864178.0, 7731246.0, 7874580.0, 8126602.0, 8217140.0, 6306880.0, 7598202.0, 6913447.0, 7964758.0, 7035041.0, 7879116.0, 8569671.0, 7028566.0, 9038832.0, 1.0641974E7, 8450936.0, 9082649.0, 6675630.0, 6481241.0, 6939335.0, 8501496.0, 8693928.0, 7876616.0, 8953343.0, 7470078.0, 7817750.0, 7845736.0, 7283306.0, 7144396.0, 8159051.0, -8244761.0, 8043437.0, 8160617.0, 7960796.0, 7638538.0, 5521627.0, 6770600.0, 6827077.0, 7424432.0, 7085102.0, 6703036.0, 8663556.0, 6008305.0, 8127122.0, 1.0757715E7, 7838981.0, 7320679.0, 6470974.0, 6395050.0, 7078761.0, 9051568.0, 8296696.0, 7669203.0, 8793438.0, 7935840.0, 8242858.0, 8167852.0, 6513437.0, 7222152.0, 8621155.0, -5398601.0, 5656258.0, 6127160.0, 6431306.0, 5861315.0, 4330490.0, 5807953.0, 5755300.0, 6320034.0, 6114599.0, 5885148.0, 6497254.0, 5450734.0, 6993714.0, 8061082.0, 6229394.0, 6511621.0, 4914814.0, 4902640.0, 5650877.0, 6261491.0, 6670406.0, 5878213.0, 7134964.0, 5872648.0, 6551681.0, 7116052.0, 5637658.0, 5610163.0, 6938664.0, -6665479.0, 6655902.0, 6361533.0, 6987969.0, 6216951.0, 4491075.0, 5339732.0, 5808102.0, 7720459.0, 6041092.0, 7229382.0, 7153949.0, 5369655.0, 6203304.0, 8545499.0, 6785740.0, 6520276.0, 5817279.0, 5606822.0, 6256946.0, 6558416.0, 6783016.0, 6327605.0, 7218378.0, 6631328.0, 6923787.0, 7005653.0, 5461050.0, 6094680.0, 7234228.0, -5843630.0, 6349173.0, 6588290.0, 6096008.0, 6187686.0, 4971317.0, 6130619.0, 5956329.0, 6069945.0, 6017854.0, 6689679.0, 6237127.0, 5164682.0, 6970236.0, 8278739.0, 6412665.0, 6644780.0, 4531485.0, 5404887.0, 5762443.0, 6496410.0, 7358530.0, 6524107.0, 7809916.0, 6432073.0, 6378435.0, 7274185.0, 5883336.0, 5290775.0, 7048857.0, -8593925.0, 7961767.0, 8292016.0, 8257014.0, 8765041.0, 5832016.0, 8184239.0, 8118105.0, 8933460.0, 7297129.0, 7909218.0, 9048344.0, 7497893.0, 8965430.0, 1.135426E7, 9530131.0, 8401874.0, 7390341.0, 7549933.0, 7675511.0, 8338916.0, 8618154.0, 8505674.0, 9483324.0, 8414926.0, 8193850.0, 9268065.0, 7355728.0, 8122626.0, 8739657.0, -7908996.0, 7506755.0, 7778220.0, 8175639.0, 5766252.0, 6284412.0, 7460913.0, 6975311.0, 7164714.0, 6767688.0, 7763452.0, 7215835.0, 6324251.0, 7271730.0, 9511219.0, 8140001.0, 7262476.0, 5372931.0, 6328202.0, 7443961.0, 7220668.0, 7721429.0, 8144112.0, 9294316.0, 6481768.0, 8506164.0, 8022424.0, 5875671.0, 5724007.0, 7793814.0, -7633150.0, 7688795.0, 7394176.0, 9147539.0, 8138372.0, 6018889.0, 6933383.0, 7053057.0, 8134616.0, 7254464.0, 7889703.0, 8526882.0, 7741309.0, 8523760.0, 1.0130337E7, 9570375.0, 8261785.0, 6437251.0, 6961202.0, 7470969.0, 8885321.0, 7952766.0, 8360734.0, 9775719.0, 7554078.0, 8847637.0, 8491061.0, 7347352.0, 7755601.0, 8960722.0, -7960421.0, 8470588.0, 8332189.0, 8444058.0, 8235295.0, 6839043.0, 7517756.0, 7415098.0, 8360872.0, 7000654.0, 8095598.0, 8937832.0, 5813282.0, 8713683.0, 1.0340755E7, 8893432.0, 8245456.0, 6941267.0, 7209275.0, 8630017.0, 9331110.0, 8897494.0, 7845643.0, 9026386.0, 8115773.0, 8895582.0, 8748333.0, 7461803.0, 7814024.0, 9201676.0 -}; diff --git a/benchmarks/dgemm/dgemm_gendata.scala b/benchmarks/dgemm/dgemm_gendata.scala deleted file mode 100644 index de2740b..0000000 --- a/benchmarks/dgemm/dgemm_gendata.scala +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env scala -!# - -val size = args(0).toInt - -def print_matrix(name: String, rows: Int, cols: Int, data: Array[Double]) = { - println("const double " + name + "[DATA_SIZE*DATA_SIZE] = {") - for (i <- 0 until rows) { - println(data.slice(cols*i, cols*(i+1)).mkString(", ") + (if (i < rows-1) ", " else "")) - } - println("};") -} -def rand_matrix(rows: Int, cols: Int) = { - var r = new scala.util.Random - var m = new Array[Double](rows*cols) - for (i <- 0 until rows*cols) - m(i) = r.nextInt(1000) - m -} -def matmul(a: Array[Double], b: Array[Double], m: Int, n: Int, k: Int) = { - var c = new Array[Double](m*n) - for (i <- 0 until m) - for (j <- 0 until n) - for (l <- 0 until k) - c(i*n+j) += a(i*n+l)*b(l*k+j) - c -} - -println("#define DATA_SIZE " + size) - -val a = rand_matrix(size, size) -val b = rand_matrix(size, size) -val c = matmul(a, b, size, size, size) - -print_matrix("input1_data", size, size, a) -print_matrix("input2_data", size, size, b) -print_matrix("verify_data", size, size, c) diff --git a/benchmarks/dgemm/dgemm_main.c b/benchmarks/dgemm/dgemm_main.c deleted file mode 100644 index 9f28c07..0000000 --- a/benchmarks/dgemm/dgemm_main.c +++ /dev/null @@ -1,122 +0,0 @@ -//************************************************************************** -// Double-precision general matrix multiplication benchmark -//-------------------------------------------------------------------------- - -#include "util.h" - -//-------------------------------------------------------------------------- -// Input/Reference Data - -#include "dataset1.h" - -//-------------------------------------------------------------------------- -// square_dgemm function - -void square_dgemm( long n0, const double a0[], const double b0[], double c0[] ) -{ - long n = (n0+2)/3*3; - double a[n*n], b[n*n], c[n*n]; - - for (long i = 0; i < n0; i++) - { - long j; - for (j = 0; j < n0; j++) - { - a[i*n+j] = a0[i*n0+j]; - b[i*n+j] = b0[j*n0+i]; - } - for ( ; j < n; j++) - { - a[i*n+j] = b[i*n+j] = 0; - } - } - for (long i = n0; i < n; i++) - for (long j = 0; j < n; j++) - a[i*n+j] = b[i*n+j] = 0; - - long i, j, k; - for (i = 0; i < n; i+=3) - { - for (j = 0; j < n; j+=3) - { - double *a0 = a + (i+0)*n, *b0 = b + (j+0)*n; - double *a1 = a + (i+1)*n, *b1 = b + (j+1)*n; - double *a2 = a + (i+2)*n, *b2 = b + (j+2)*n; - - double s00 = 0, s01 = 0, s02 = 0; - double s10 = 0, s11 = 0, s12 = 0; - double s20 = 0, s21 = 0, s22 = 0; - - while (a0 < a + (i+1)*n) - { - double a00 = a0[0], a01 = a0[1], a02 = a0[2]; - double b00 = b0[0], b01 = b0[1], b02 = b0[2]; - double a10 = a1[0], a11 = a1[1], a12 = a1[2]; - double b10 = b1[0], b11 = b1[1], b12 = b1[2]; - asm ("" ::: "memory"); - double a20 = a2[0], a21 = a2[1], a22 = a2[2]; - double b20 = b2[0], b21 = b2[1], b22 = b2[2]; - - s00 = a00*b00 + (a01*b01 + (a02*b02 + s00)); - s01 = a00*b10 + (a01*b11 + (a02*b12 + s01)); - s02 = a00*b20 + (a01*b21 + (a02*b22 + s02)); - s10 = a10*b00 + (a11*b01 + (a12*b02 + s10)); - s11 = a10*b10 + (a11*b11 + (a12*b12 + s11)); - s12 = a10*b20 + (a11*b21 + (a12*b22 + s12)); - s20 = a20*b00 + (a21*b01 + (a22*b02 + s20)); - s21 = a20*b10 + (a21*b11 + (a22*b12 + s21)); - s22 = a20*b20 + (a21*b21 + (a22*b22 + s22)); - - a0 += 3; b0 += 3; - a1 += 3; b1 += 3; - a2 += 3; b2 += 3; - } - - c[(i+0)*n+j+0] = s00; c[(i+0)*n+j+1] = s01; c[(i+0)*n+j+2] = s02; - c[(i+1)*n+j+0] = s10; c[(i+1)*n+j+1] = s11; c[(i+1)*n+j+2] = s12; - c[(i+2)*n+j+0] = s20; c[(i+2)*n+j+1] = s21; c[(i+2)*n+j+2] = s22; - } - } - - for (long i = 0; i < n0; i++) - { - long j; - for (j = 0; j < n0 - 2; j+=3) - { - c0[i*n0+j+0] = c[i*n+j+0]; - c0[i*n0+j+1] = c[i*n+j+1]; - c0[i*n0+j+2] = c[i*n+j+2]; - } - for ( ; j < n0; j++) - c0[i*n0+j] = c[i*n+j]; - } -} - -//-------------------------------------------------------------------------- -// Main - -int main( int argc, char* argv[] ) -{ - double results_data[DATA_SIZE*DATA_SIZE]; - - // Output the input array - printDoubleArray( "input1", DATA_SIZE*DATA_SIZE, input1_data ); - printDoubleArray( "input2", DATA_SIZE*DATA_SIZE, input2_data ); - printDoubleArray( "verify", DATA_SIZE*DATA_SIZE, verify_data ); - -#if PREALLOCATE - // If needed we preallocate everything in the caches - square_dgemm( DATA_SIZE, input1_data, input2_data, results_data ); -#endif - - // Do the dgemm - setStats(1); - square_dgemm( DATA_SIZE, input1_data, input2_data, results_data ); - setStats(0); - - // Print out the results - printDoubleArray( "results", DATA_SIZE*DATA_SIZE, results_data ); - - // Check the results - return verifyDouble( DATA_SIZE*DATA_SIZE, results_data, verify_data ); -} diff --git a/benchmarks/mm/bmark.mk b/benchmarks/mm/bmark.mk new file mode 100644 index 0000000..bb982a8 --- /dev/null +++ b/benchmarks/mm/bmark.mk @@ -0,0 +1,32 @@ +#======================================================================= +# UCB CS250 Makefile fragment for benchmarks +#----------------------------------------------------------------------- +# +# Each benchmark directory should have its own fragment which +# essentially lists what the source files are and how to link them +# into an riscv and/or host executable. All variables should include +# the benchmark name as a prefix so that they are unique. +# + +mm_c_src = \ + mm_main.c \ + mm.c \ + syscalls.c \ + +mm_riscv_src = \ + hwacha.S \ + crt.S \ + +mm_c_objs = $(patsubst %.c, %.o, $(mm_c_src)) +mm_riscv_objs = $(patsubst %.S, %.o, $(mm_riscv_src)) + +mm_host_bin = mm.host +$(mm_host_bin) : $(mm_c_src) + $(HOST_COMP) $^ -o $(mm_host_bin) + +mm_riscv_bin = mm.riscv +$(mm_riscv_bin) : $(mm_c_objs) $(mm_riscv_objs) + $(RISCV_LINK) $(mm_c_objs) $(mm_riscv_objs) -o $(mm_riscv_bin) $(RISCV_LINK_OPTS) + +junk += $(mm_c_objs) $(mm_riscv_objs) \ + $(mm_host_bin) $(mm_riscv_bin) diff --git a/benchmarks/mm/common.h b/benchmarks/mm/common.h new file mode 100644 index 0000000..f0e6709 --- /dev/null +++ b/benchmarks/mm/common.h @@ -0,0 +1,35 @@ +#ifndef _MM_H +#define _MM_H + +#include +#include +#include + +#ifdef SP +typedef float t; +#define fma fmaf +#else +typedef double t; +#endif + +#define inline inline __attribute__((always_inline)) + +#define alloca_aligned(s, a) ((void*)(((uintptr_t)alloca((s)+(a)-1)+(a)-1)&~((a)-1))) + +#include "rb.h" +#include "hwacha.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void mm(size_t m, size_t n, size_t p, + t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc); + +#ifdef __cplusplus +} +#endif + +//void rb(t* a, t* b, t* c, size_t lda, size_t ldb, size_t ldc); + +#endif diff --git a/benchmarks/mm/gen.scala b/benchmarks/mm/gen.scala new file mode 100644 index 0000000..2d3dc34 --- /dev/null +++ b/benchmarks/mm/gen.scala @@ -0,0 +1,81 @@ +import scala.sys.process._ +object MMGen { + implicit def i2s(i: Int) = i.toString + def writeFile(name: String, contents: String) = { + val f = new java.io.FileWriter(name) + f.write(contents) + f.close + } + + var indent = 0 + def spacing = " " * indent + def assign(lhs: String, rhs: String) = + spacing + lhs + " = " + rhs + ";\n" + def init(t: String, n: String, v: String) = + assign(t+" "+n, v) + def open_block(s: String = "") = { + val result = (if (s != "") spacing + s else "") + spacing + "{\n" + indent = indent + 1 + result + } + def close_block = { + indent = indent - 1 + spacing + "}\n" + } + + def ar(m: String, i: String) = m+"["+i+"]" + def r(a: String, b: String*) = (a :: b.toList).reduceLeft(_+"_"+_) + + def rb(m: Int, n: Int, p: Int) = { + var s = open_block("static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)\n") + + for (i <- 0 until m) + s += init("t*", r("c", i), "&"+ar("c", "ldc*"+i)) + for (i <- 0 until m; j <- 0 until n) + s += init("t", r("c", i, j), ar(r("c", i), j)) + + def doit(m: Int, n: Int, p: Int) = { + for (i <- 0 until m) + s += init("t*", r("a", i), "&"+ar("a", "lda*"+i)) + for (k <- 0 until p) + s += init("t*", r("b", k), "&"+ar("b", "ldb*"+k)) + for (i <- 0 until m; j <- 0 until n; k <- 0 until p) + s += assign(r("c", i, j), "fma(" + ar(r("a", i), k) + ", " + ar(r("b", k), j) + ", " + r("c", i, j) + ")") + } + + s += open_block("for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)\n") + doit(m, n, p) + s += close_block + + s += open_block("for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)\n") + doit(m, n, 1) + s += close_block + + for (i <- 0 until m; j <- 0 until n) + s += assign(ar(r("c", i), j), r("c", i, j)) + s += close_block + + s + } + def gcd(a: Int, b: Int): Int = if (b == 0) a else gcd(b, a%b) + def lcm(a: Int, b: Int): Int = a*b/gcd(a, b) + def lcm(a: Seq[Int]): Int = { + if (a.tail.isEmpty) a.head + else lcm(a.head, lcm(a.tail)) + } + def test1(m: Int, n: Int, p: Int, m1: Int, n1: Int, p1: Int) = { + val decl = "static const int RBM = "+m+", RBN = "+n+", RBK = "+p+";\n" + + "static const int CBM = "+m1+", CBN = "+n1+", CBK = "+p1+";\n" + writeFile("rb.h", decl + rb(m, n, p)) + //"make"!! + + "make run"! + + ("cp a.out " + Seq("b", m, n, p, m1, n1, p1, "run").reduce(_+"."+_))! + } + def main(args: Array[String]): Unit = { + test1(4, 5, 6, 36, 35, 36) + //for (i <- 4 to 6; j <- 4 to 6; k <- 4 to 6) + // test1(i, j, k, if (i == 5) 35 else 36, if (j == 5) 35 else 36, if (k == 5) 35 else 36) + } +} diff --git a/benchmarks/mm/hwacha.S b/benchmarks/mm/hwacha.S new file mode 100644 index 0000000..2a07f63 --- /dev/null +++ b/benchmarks/mm/hwacha.S @@ -0,0 +1,92 @@ + .text + .align 2 + + .globl hwacha_mm_0 +hwacha_mm_0: + fmadd.d f0,f19,f20,f0 + stop + + .globl hwacha_mm_1 +hwacha_mm_1: + fmadd.d f1,f18,f20,f1 + stop + + .globl hwacha_mm_2 +hwacha_mm_2: + fmadd.d f2,f19,f20,f2 + stop + + .globl hwacha_mm_3 +hwacha_mm_3: + fmadd.d f3,f18,f20,f3 + stop + + .globl hwacha_mm_4 +hwacha_mm_4: + fmadd.d f4,f19,f20,f4 + stop + + .globl hwacha_mm_5 +hwacha_mm_5: + fmadd.d f5,f18,f20,f5 + stop + + .globl hwacha_mm_6 +hwacha_mm_6: + fmadd.d f6,f19,f20,f6 + stop + + .globl hwacha_mm_7 +hwacha_mm_7: + fmadd.d f7,f18,f20,f7 + stop + + .globl hwacha_mm_8 +hwacha_mm_8: + fmadd.d f8,f19,f20,f8 + stop + + .globl hwacha_mm_9 +hwacha_mm_9: + fmadd.d f9,f18,f20,f9 + stop + + .globl hwacha_mm_10 +hwacha_mm_10: + fmadd.d f10,f19,f20,f10 + stop + + .globl hwacha_mm_11 +hwacha_mm_11: + fmadd.d f11,f18,f20,f11 + stop + + .globl hwacha_mm_12 +hwacha_mm_12: + fmadd.d f12,f19,f20,f12 + stop + + .globl hwacha_mm_13 +hwacha_mm_13: + fmadd.d f13,f18,f20,f13 + stop + + .globl hwacha_mm_14 +hwacha_mm_14: + fmadd.d f14,f19,f20,f14 + stop + + .globl hwacha_mm_15 +hwacha_mm_15: + fmadd.d f15,f18,f20,f15 + stop + + .globl hwacha_mm_16 +hwacha_mm_16: + fmadd.d f16,f19,f20,f16 + stop + + .globl hwacha_mm_17 +hwacha_mm_17: + fmadd.d f17,f18,f20,f17 + stop diff --git a/benchmarks/mm/hwacha.h b/benchmarks/mm/hwacha.h new file mode 100644 index 0000000..c12d854 --- /dev/null +++ b/benchmarks/mm/hwacha.h @@ -0,0 +1,138 @@ +static const int HCBM = 18; +static const int HCBN = 80; +static const int HCBK = 16; + +static const int HRBM = 18; +static const int HRBN = 80; +static const int HRBK = 1; + +extern void hwacha_mm_0(); +extern void hwacha_mm_1(); +extern void hwacha_mm_2(); +extern void hwacha_mm_3(); +extern void hwacha_mm_4(); +extern void hwacha_mm_5(); +extern void hwacha_mm_6(); +extern void hwacha_mm_7(); +extern void hwacha_mm_8(); +extern void hwacha_mm_9(); +extern void hwacha_mm_10(); +extern void hwacha_mm_11(); +extern void hwacha_mm_12(); +extern void hwacha_mm_13(); +extern void hwacha_mm_14(); +extern void hwacha_mm_15(); +extern void hwacha_mm_16(); +extern void hwacha_mm_17(); + +static inline void nloop(int s, int e, t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc) +{ + asm volatile ("vfmsv.d vf19, %[ptr]" : : [ptr]"r"(a[0*lda])); + asm volatile ("vfld vf20, %[ptr]" : : [ptr]"r"(b) : ); + if (s) asm volatile ("vfld vf0, %[ptr]" : : [ptr]"r"(&c[ldc*0]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_0) : ); + if (e) asm volatile ("vfsd vf0, %[ptr]" : : [ptr]"r"(&c[ldc*0]) : ); + + asm volatile ("vfmsv.d vf18, %[ptr]" : : [ptr]"r"(a[1*lda])); + if (s) asm volatile ("vfld vf1, %[ptr]" : : [ptr]"r"(&c[ldc*1]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_1) : ); + if (e) asm volatile ("vfsd vf1, %[ptr]" : : [ptr]"r"(&c[ldc*1]) : ); + + asm volatile ("vfmsv.d vf19, %[ptr]" : : [ptr]"r"(a[2*lda])); + if (s) asm volatile ("vfld vf2, %[ptr]" : : [ptr]"r"(&c[ldc*2]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_2) : ); + if (e) asm volatile ("vfsd vf2, %[ptr]" : : [ptr]"r"(&c[ldc*2]) : ); + + asm volatile ("vfmsv.d vf18, %[ptr]" : : [ptr]"r"(a[3*lda])); + if (s) asm volatile ("vfld vf3, %[ptr]" : : [ptr]"r"(&c[ldc*3]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_3) : ); + if (e) asm volatile ("vfsd vf3, %[ptr]" : : [ptr]"r"(&c[ldc*3]) : ); + + asm volatile ("vfmsv.d vf19, %[ptr]" : : [ptr]"r"(a[4*lda])); + if (s) asm volatile ("vfld vf4, %[ptr]" : : [ptr]"r"(&c[ldc*4]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_4) : ); + if (e) asm volatile ("vfsd vf4, %[ptr]" : : [ptr]"r"(&c[ldc*4]) : ); + + asm volatile ("vfmsv.d vf18, %[ptr]" : : [ptr]"r"(a[5*lda])); + if (s) asm volatile ("vfld vf5, %[ptr]" : : [ptr]"r"(&c[ldc*5]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_5) : ); + if (e) asm volatile ("vfsd vf5, %[ptr]" : : [ptr]"r"(&c[ldc*5]) : ); + + asm volatile ("vfmsv.d vf19, %[ptr]" : : [ptr]"r"(a[6*lda])); + if (s) asm volatile ("vfld vf6, %[ptr]" : : [ptr]"r"(&c[ldc*6]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_6) : ); + if (e) asm volatile ("vfsd vf6, %[ptr]" : : [ptr]"r"(&c[ldc*6]) : ); + + asm volatile ("vfmsv.d vf18, %[ptr]" : : [ptr]"r"(a[7*lda])); + if (s) asm volatile ("vfld vf7, %[ptr]" : : [ptr]"r"(&c[ldc*7]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_7) : ); + if (e) asm volatile ("vfsd vf7, %[ptr]" : : [ptr]"r"(&c[ldc*7]) : ); + + asm volatile ("vfmsv.d vf19, %[ptr]" : : [ptr]"r"(a[8*lda])); + if (s) asm volatile ("vfld vf8, %[ptr]" : : [ptr]"r"(&c[ldc*8]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_8) : ); + if (e) asm volatile ("vfsd vf8, %[ptr]" : : [ptr]"r"(&c[ldc*8]) : ); + + asm volatile ("vfmsv.d vf18, %[ptr]" : : [ptr]"r"(a[9*lda])); + if (s) asm volatile ("vfld vf9, %[ptr]" : : [ptr]"r"(&c[ldc*9]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_9) : ); + if (e) asm volatile ("vfsd vf9, %[ptr]" : : [ptr]"r"(&c[ldc*9]) : ); + + asm volatile ("vfmsv.d vf19, %[ptr]" : : [ptr]"r"(a[10*lda])); + if (s) asm volatile ("vfld vf10, %[ptr]" : : [ptr]"r"(&c[ldc*10]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_10) : ); + if (e) asm volatile ("vfsd vf10, %[ptr]" : : [ptr]"r"(&c[ldc*10]) : ); + + asm volatile ("vfmsv.d vf18, %[ptr]" : : [ptr]"r"(a[11*lda])); + if (s) asm volatile ("vfld vf11, %[ptr]" : : [ptr]"r"(&c[ldc*11]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_11) : ); + if (e) asm volatile ("vfsd vf11, %[ptr]" : : [ptr]"r"(&c[ldc*11]) : ); + + asm volatile ("vfmsv.d vf19, %[ptr]" : : [ptr]"r"(a[12*lda])); + if (s) asm volatile ("vfld vf12, %[ptr]" : : [ptr]"r"(&c[ldc*12]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_12) : ); + if (e) asm volatile ("vfsd vf12, %[ptr]" : : [ptr]"r"(&c[ldc*12]) : ); + + asm volatile ("vfmsv.d vf18, %[ptr]" : : [ptr]"r"(a[13*lda])); + if (s) asm volatile ("vfld vf13, %[ptr]" : : [ptr]"r"(&c[ldc*13]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_13) : ); + if (e) asm volatile ("vfsd vf13, %[ptr]" : : [ptr]"r"(&c[ldc*13]) : ); + + asm volatile ("vfmsv.d vf19, %[ptr]" : : [ptr]"r"(a[14*lda])); + if (s) asm volatile ("vfld vf14, %[ptr]" : : [ptr]"r"(&c[ldc*14]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_14) : ); + if (e) asm volatile ("vfsd vf14, %[ptr]" : : [ptr]"r"(&c[ldc*14]) : ); + + asm volatile ("vfmsv.d vf18, %[ptr]" : : [ptr]"r"(a[15*lda])); + if (s) asm volatile ("vfld vf15, %[ptr]" : : [ptr]"r"(&c[ldc*15]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_15) : ); + if (e) asm volatile ("vfsd vf15, %[ptr]" : : [ptr]"r"(&c[ldc*15]) : ); + + asm volatile ("vfmsv.d vf19, %[ptr]" : : [ptr]"r"(a[16*lda])); + if (s) asm volatile ("vfld vf16, %[ptr]" : : [ptr]"r"(&c[ldc*16]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_16) : ); + if (e) asm volatile ("vfsd vf16, %[ptr]" : : [ptr]"r"(&c[ldc*16]) : ); + + asm volatile ("vfmsv.d vf18, %[ptr]" : : [ptr]"r"(a[17*lda])); + if (s) asm volatile ("vfld vf17, %[ptr]" : : [ptr]"r"(&c[ldc*17]) : ); + asm volatile ("vf 0(%[vf_ptr])" : : [vf_ptr]"r"(&hwacha_mm_17) : ); + if (e) asm volatile ("vfsd vf17, %[ptr]" : : [ptr]"r"(&c[ldc*17]) : ); +} + +static inline void mm_rb_hwacha(size_t m, size_t n, size_t p, + t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc) +{ + int vl; + asm volatile ("vsetcfg 4, 21" : : : ); + asm volatile ("vsetvl %[gvl], %[nvl]" : [gvl]"=r"(vl) : [nvl]"r"(n) : ); + + size_t mb = m/HRBM*HRBM, nk=p/HRBK*HRBK; + + for (size_t i=0; i +#include +#include +#include + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +static void mm_naive(size_t m, size_t n, size_t p, + t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc) +{ + for (size_t i = 0; i < m; i++) + { + for (size_t j = 0; j < n; j++) + { + t s0 = c[i*ldc+j], s1 = 0, s2 = 0, s3 = 0; + for (size_t k = 0; k < p/4*4; k+=4) + { + s0 = fma(a[i*lda+k+0], b[(k+0)*ldb+j], s0); + s1 = fma(a[i*lda+k+1], b[(k+1)*ldb+j], s1); + s2 = fma(a[i*lda+k+2], b[(k+2)*ldb+j], s2); + s3 = fma(a[i*lda+k+3], b[(k+3)*ldb+j], s3); + } + for (size_t k = p/4*4; k < p; k++) + s0 = fma(a[i*lda+k], b[k*ldb+j], s0); + c[i*ldc+j] = (s0 + s1) + (s2 + s3); + } + } +} + +static inline void mm_rb(size_t m, size_t n, size_t p, + t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc) +{ + size_t mb = m/RBM*RBM, nb = n/RBN*RBN; + for (size_t i = 0; i < mb; i += RBM) + { + for (size_t j = 0; j < nb; j += RBN) + kloop(p, a+i*lda, lda, b+j, ldb, c+i*ldc+j, ldc); + mm_naive(RBM, n - nb, p, a+i*lda, lda, b+nb, ldb, c+i*ldc+nb, ldc); + } + mm_naive(m - mb, n, p, a+mb*lda, lda, b, ldb, c+mb*ldc, ldc); +} + +static inline void repack(t* a, size_t lda, const t* a0, size_t lda0, size_t m, size_t p) +{ + for (size_t i = 0; i < m; i++) + { + for (size_t j = 0; j < p/8*8; j+=8) + { + t t0 = a0[i*lda0+j+0]; + t t1 = a0[i*lda0+j+1]; + t t2 = a0[i*lda0+j+2]; + t t3 = a0[i*lda0+j+3]; + t t4 = a0[i*lda0+j+4]; + t t5 = a0[i*lda0+j+5]; + t t6 = a0[i*lda0+j+6]; + t t7 = a0[i*lda0+j+7]; + a[i*lda+j+0] = t0; + a[i*lda+j+1] = t1; + a[i*lda+j+2] = t2; + a[i*lda+j+3] = t3; + a[i*lda+j+4] = t4; + a[i*lda+j+5] = t5; + a[i*lda+j+6] = t6; + a[i*lda+j+7] = t7; + } + for (size_t j = p/8*8; j < p; j++) + a[i*lda+j] = a0[i*lda0+j]; + } +} + +static void mm_cb(size_t m, size_t n, size_t p, + t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc) +{ + size_t nmb = m/CBM, nnb = n/CBN, npb = p/CBK; + size_t mb = nmb*CBM, nb = nnb*CBN, pb = npb*CBK; + //t a1[mb*pb], b1[pb*nb], c1[mb*nb]; + t* a1 = (t*)alloca_aligned(sizeof(t)*mb*pb, 8192); + t* b1 = (t*)alloca_aligned(sizeof(t)*pb*nb, 8192); + t* c1 = (t*)alloca_aligned(sizeof(t)*mb*nb, 8192); + + for (size_t i = 0; i < mb; i += CBM) + for (size_t j = 0; j < pb; j += CBK) + repack(a1 + (npb*(i/CBM) + j/CBK)*(CBM*CBK), CBK, a + i*lda + j, lda, CBM, CBK); + + for (size_t i = 0; i < pb; i += CBK) + for (size_t j = 0; j < nb; j += CBN) + repack(b1 + (nnb*(i/CBK) + j/CBN)*(CBK*CBN), CBN, b + i*ldb + j, ldb, CBK, CBN); + + for (size_t i = 0; i < mb; i += CBM) + for (size_t j = 0; j < nb; j += CBN) + repack(c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN, c + i*ldc + j, ldc, CBM, CBN); + + for (size_t i = 0; i < mb; i += CBM) + { + for (size_t j = 0; j < nb; j += CBN) + { + for (size_t k = 0; k < pb; k += CBK) + { + mm_rb(CBM, CBN, CBK, + a1 + (npb*(i/CBM) + k/CBK)*(CBM*CBK), CBK, + b1 + (nnb*(k/CBK) + j/CBN)*(CBK*CBN), CBN, + c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN); + } + if (pb < p) + { + mm_rb(CBM, CBN, p - pb, + a + i*lda + pb, lda, + b + pb*ldb + j, ldb, + c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN); + } + } + if (nb < n) + { + for (size_t k = 0; k < p; k += CBK) + { + mm_rb(CBM, n - nb, MIN(p - k, CBK), + a + i*lda + k, lda, + b + k*ldb + nb, ldb, + c + i*ldc + nb, ldc); + } + } + } + if (mb < m) + { + for (size_t j = 0; j < n; j += CBN) + { + for (size_t k = 0; k < p; k += CBK) + { + mm_rb(m - mb, MIN(n - j, CBN), MIN(p - k, CBK), + a + mb*lda + k, lda, + b + k*ldb + j, ldb, + c + mb*ldc + j, ldc); + } + } + } + + for (size_t i = 0; i < mb; i += CBM) + for (size_t j = 0; j < nb; j += CBN) + repack(c + i*ldc + j, ldc, c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN, CBM, CBN); +} + +void mm(size_t m, size_t n, size_t p, + t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc) +{ + if (__builtin_expect(m <= 2*CBM && n <= 2*CBN && p <= 2*CBK, 1)) + mm_rb(m, n, p, a, lda, b, ldb, c, ldc); + else + mm_cb(m, n, p, a, lda, b, ldb, c, ldc); +} diff --git a/benchmarks/mm/mm_main.c b/benchmarks/mm/mm_main.c new file mode 100644 index 0000000..522768a --- /dev/null +++ b/benchmarks/mm/mm_main.c @@ -0,0 +1,80 @@ +#include "common.h" +#include +#include +#include "util.h" + +void thread_entry(int cid, int nc) +{ + const int R = 8; + int m, n, p; + + if (have_vec) { + m = HCBM; + n = HCBN; + p = HCBK; + } else { + m = CBM; + n = CBN; + p = CBK; + } + + t a[m*p]; + t b[p*n]; + t c[m*n]; + + for (size_t i = 0; i < m; i++) + for (size_t j = 0; j < p; j++) + a[i*p+j] = i+j; + for (size_t i = 0; i < p; i++) + for (size_t j = 0; j < n; j++) + b[i*n+j] = i-j; + memset(c, 0, m*n*sizeof(c[0])); + + size_t instret, cycles; + if (have_vec) { + for (int i = 0; i < R; i++) + { + instret = -rdinstret(); + cycles = -rdcycle(); + mm_rb_hwacha(m, n, p, a, p, b, n, c, n); + instret += rdinstret(); + cycles += rdcycle(); + } + } else { + for (int i = 0; i < R; i++) + { + instret = -rdinstret(); + cycles = -rdcycle(); + mm(m, n, p, a, p, b, n, c, n); + instret += rdinstret(); + cycles += rdcycle(); + } + } + + printf("C%d: reg block %dx%dx%d, cache block %dx%dx%d\n", + cid, RBM, RBN, RBK, CBM, CBN, CBK); + printf("C%d: %d instructions\n", cid, (int)(instret)); + printf("C%d: %d cycles\n", cid, (int)(cycles)); + printf("C%d: %d flops\n", cid, 2*m*n*p); + printf("C%d: %d Mflops @ 1 GHz\n", cid, 2000*m*n*p/(cycles)); + +#if 1 + for (size_t i = 0; i < m; i++) + { + for (size_t j = 0; j < n; j++) + { + t s = 0; + for (size_t aik = i, bkj = -j; aik < i+p; aik++, bkj++) + s += (t)aik*(t)bkj; + if (fabs(c[i*n+j]-s*R) > 1e-6*s) + { + printf("C%d: c[%lu][%lu] %f != %f\n", cid, i, j, c[i*n+j], s); + exit(1); + } + } + } +#endif + + barrier(nc); + exit(0); +} diff --git a/benchmarks/mm/rb.h b/benchmarks/mm/rb.h new file mode 100644 index 0000000..c5d5890 --- /dev/null +++ b/benchmarks/mm/rb.h @@ -0,0 +1,210 @@ +static const int RBM = 4, RBN = 5, RBK = 6; +static const int CBM = 36, CBN = 35, CBK = 36; +static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc) +{ + t* c_0 = &c[ldc*0]; + t* c_1 = &c[ldc*1]; + t* c_2 = &c[ldc*2]; + t* c_3 = &c[ldc*3]; + t c_0_0 = c_0[0]; + t c_0_1 = c_0[1]; + t c_0_2 = c_0[2]; + t c_0_3 = c_0[3]; + t c_0_4 = c_0[4]; + t c_1_0 = c_1[0]; + t c_1_1 = c_1[1]; + t c_1_2 = c_1[2]; + t c_1_3 = c_1[3]; + t c_1_4 = c_1[4]; + t c_2_0 = c_2[0]; + t c_2_1 = c_2[1]; + t c_2_2 = c_2[2]; + t c_2_3 = c_2[3]; + t c_2_4 = c_2[4]; + t c_3_0 = c_3[0]; + t c_3_1 = c_3[1]; + t c_3_2 = c_3[2]; + t c_3_3 = c_3[3]; + t c_3_4 = c_3[4]; + for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb) + { + t* a_0 = &a[lda*0]; + t* a_1 = &a[lda*1]; + t* a_2 = &a[lda*2]; + t* a_3 = &a[lda*3]; + t* b_0 = &b[ldb*0]; + t* b_1 = &b[ldb*1]; + t* b_2 = &b[ldb*2]; + t* b_3 = &b[ldb*3]; + t* b_4 = &b[ldb*4]; + t* b_5 = &b[ldb*5]; + c_0_0 = fma(a_0[0], b_0[0], c_0_0); + c_0_0 = fma(a_0[1], b_1[0], c_0_0); + c_0_0 = fma(a_0[2], b_2[0], c_0_0); + c_0_0 = fma(a_0[3], b_3[0], c_0_0); + c_0_0 = fma(a_0[4], b_4[0], c_0_0); + c_0_0 = fma(a_0[5], b_5[0], c_0_0); + c_0_1 = fma(a_0[0], b_0[1], c_0_1); + c_0_1 = fma(a_0[1], b_1[1], c_0_1); + c_0_1 = fma(a_0[2], b_2[1], c_0_1); + c_0_1 = fma(a_0[3], b_3[1], c_0_1); + c_0_1 = fma(a_0[4], b_4[1], c_0_1); + c_0_1 = fma(a_0[5], b_5[1], c_0_1); + c_0_2 = fma(a_0[0], b_0[2], c_0_2); + c_0_2 = fma(a_0[1], b_1[2], c_0_2); + c_0_2 = fma(a_0[2], b_2[2], c_0_2); + c_0_2 = fma(a_0[3], b_3[2], c_0_2); + c_0_2 = fma(a_0[4], b_4[2], c_0_2); + c_0_2 = fma(a_0[5], b_5[2], c_0_2); + c_0_3 = fma(a_0[0], b_0[3], c_0_3); + c_0_3 = fma(a_0[1], b_1[3], c_0_3); + c_0_3 = fma(a_0[2], b_2[3], c_0_3); + c_0_3 = fma(a_0[3], b_3[3], c_0_3); + c_0_3 = fma(a_0[4], b_4[3], c_0_3); + c_0_3 = fma(a_0[5], b_5[3], c_0_3); + c_0_4 = fma(a_0[0], b_0[4], c_0_4); + c_0_4 = fma(a_0[1], b_1[4], c_0_4); + c_0_4 = fma(a_0[2], b_2[4], c_0_4); + c_0_4 = fma(a_0[3], b_3[4], c_0_4); + c_0_4 = fma(a_0[4], b_4[4], c_0_4); + c_0_4 = fma(a_0[5], b_5[4], c_0_4); + c_1_0 = fma(a_1[0], b_0[0], c_1_0); + c_1_0 = fma(a_1[1], b_1[0], c_1_0); + c_1_0 = fma(a_1[2], b_2[0], c_1_0); + c_1_0 = fma(a_1[3], b_3[0], c_1_0); + c_1_0 = fma(a_1[4], b_4[0], c_1_0); + c_1_0 = fma(a_1[5], b_5[0], c_1_0); + c_1_1 = fma(a_1[0], b_0[1], c_1_1); + c_1_1 = fma(a_1[1], b_1[1], c_1_1); + c_1_1 = fma(a_1[2], b_2[1], c_1_1); + c_1_1 = fma(a_1[3], b_3[1], c_1_1); + c_1_1 = fma(a_1[4], b_4[1], c_1_1); + c_1_1 = fma(a_1[5], b_5[1], c_1_1); + c_1_2 = fma(a_1[0], b_0[2], c_1_2); + c_1_2 = fma(a_1[1], b_1[2], c_1_2); + c_1_2 = fma(a_1[2], b_2[2], c_1_2); + c_1_2 = fma(a_1[3], b_3[2], c_1_2); + c_1_2 = fma(a_1[4], b_4[2], c_1_2); + c_1_2 = fma(a_1[5], b_5[2], c_1_2); + c_1_3 = fma(a_1[0], b_0[3], c_1_3); + c_1_3 = fma(a_1[1], b_1[3], c_1_3); + c_1_3 = fma(a_1[2], b_2[3], c_1_3); + c_1_3 = fma(a_1[3], b_3[3], c_1_3); + c_1_3 = fma(a_1[4], b_4[3], c_1_3); + c_1_3 = fma(a_1[5], b_5[3], c_1_3); + c_1_4 = fma(a_1[0], b_0[4], c_1_4); + c_1_4 = fma(a_1[1], b_1[4], c_1_4); + c_1_4 = fma(a_1[2], b_2[4], c_1_4); + c_1_4 = fma(a_1[3], b_3[4], c_1_4); + c_1_4 = fma(a_1[4], b_4[4], c_1_4); + c_1_4 = fma(a_1[5], b_5[4], c_1_4); + c_2_0 = fma(a_2[0], b_0[0], c_2_0); + c_2_0 = fma(a_2[1], b_1[0], c_2_0); + c_2_0 = fma(a_2[2], b_2[0], c_2_0); + c_2_0 = fma(a_2[3], b_3[0], c_2_0); + c_2_0 = fma(a_2[4], b_4[0], c_2_0); + c_2_0 = fma(a_2[5], b_5[0], c_2_0); + c_2_1 = fma(a_2[0], b_0[1], c_2_1); + c_2_1 = fma(a_2[1], b_1[1], c_2_1); + c_2_1 = fma(a_2[2], b_2[1], c_2_1); + c_2_1 = fma(a_2[3], b_3[1], c_2_1); + c_2_1 = fma(a_2[4], b_4[1], c_2_1); + c_2_1 = fma(a_2[5], b_5[1], c_2_1); + c_2_2 = fma(a_2[0], b_0[2], c_2_2); + c_2_2 = fma(a_2[1], b_1[2], c_2_2); + c_2_2 = fma(a_2[2], b_2[2], c_2_2); + c_2_2 = fma(a_2[3], b_3[2], c_2_2); + c_2_2 = fma(a_2[4], b_4[2], c_2_2); + c_2_2 = fma(a_2[5], b_5[2], c_2_2); + c_2_3 = fma(a_2[0], b_0[3], c_2_3); + c_2_3 = fma(a_2[1], b_1[3], c_2_3); + c_2_3 = fma(a_2[2], b_2[3], c_2_3); + c_2_3 = fma(a_2[3], b_3[3], c_2_3); + c_2_3 = fma(a_2[4], b_4[3], c_2_3); + c_2_3 = fma(a_2[5], b_5[3], c_2_3); + c_2_4 = fma(a_2[0], b_0[4], c_2_4); + c_2_4 = fma(a_2[1], b_1[4], c_2_4); + c_2_4 = fma(a_2[2], b_2[4], c_2_4); + c_2_4 = fma(a_2[3], b_3[4], c_2_4); + c_2_4 = fma(a_2[4], b_4[4], c_2_4); + c_2_4 = fma(a_2[5], b_5[4], c_2_4); + c_3_0 = fma(a_3[0], b_0[0], c_3_0); + c_3_0 = fma(a_3[1], b_1[0], c_3_0); + c_3_0 = fma(a_3[2], b_2[0], c_3_0); + c_3_0 = fma(a_3[3], b_3[0], c_3_0); + c_3_0 = fma(a_3[4], b_4[0], c_3_0); + c_3_0 = fma(a_3[5], b_5[0], c_3_0); + c_3_1 = fma(a_3[0], b_0[1], c_3_1); + c_3_1 = fma(a_3[1], b_1[1], c_3_1); + c_3_1 = fma(a_3[2], b_2[1], c_3_1); + c_3_1 = fma(a_3[3], b_3[1], c_3_1); + c_3_1 = fma(a_3[4], b_4[1], c_3_1); + c_3_1 = fma(a_3[5], b_5[1], c_3_1); + c_3_2 = fma(a_3[0], b_0[2], c_3_2); + c_3_2 = fma(a_3[1], b_1[2], c_3_2); + c_3_2 = fma(a_3[2], b_2[2], c_3_2); + c_3_2 = fma(a_3[3], b_3[2], c_3_2); + c_3_2 = fma(a_3[4], b_4[2], c_3_2); + c_3_2 = fma(a_3[5], b_5[2], c_3_2); + c_3_3 = fma(a_3[0], b_0[3], c_3_3); + c_3_3 = fma(a_3[1], b_1[3], c_3_3); + c_3_3 = fma(a_3[2], b_2[3], c_3_3); + c_3_3 = fma(a_3[3], b_3[3], c_3_3); + c_3_3 = fma(a_3[4], b_4[3], c_3_3); + c_3_3 = fma(a_3[5], b_5[3], c_3_3); + c_3_4 = fma(a_3[0], b_0[4], c_3_4); + c_3_4 = fma(a_3[1], b_1[4], c_3_4); + c_3_4 = fma(a_3[2], b_2[4], c_3_4); + c_3_4 = fma(a_3[3], b_3[4], c_3_4); + c_3_4 = fma(a_3[4], b_4[4], c_3_4); + c_3_4 = fma(a_3[5], b_5[4], c_3_4); + } + for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb) + { + t* a_0 = &a[lda*0]; + t* a_1 = &a[lda*1]; + t* a_2 = &a[lda*2]; + t* a_3 = &a[lda*3]; + t* b_0 = &b[ldb*0]; + c_0_0 = fma(a_0[0], b_0[0], c_0_0); + c_0_1 = fma(a_0[0], b_0[1], c_0_1); + c_0_2 = fma(a_0[0], b_0[2], c_0_2); + c_0_3 = fma(a_0[0], b_0[3], c_0_3); + c_0_4 = fma(a_0[0], b_0[4], c_0_4); + c_1_0 = fma(a_1[0], b_0[0], c_1_0); + c_1_1 = fma(a_1[0], b_0[1], c_1_1); + c_1_2 = fma(a_1[0], b_0[2], c_1_2); + c_1_3 = fma(a_1[0], b_0[3], c_1_3); + c_1_4 = fma(a_1[0], b_0[4], c_1_4); + c_2_0 = fma(a_2[0], b_0[0], c_2_0); + c_2_1 = fma(a_2[0], b_0[1], c_2_1); + c_2_2 = fma(a_2[0], b_0[2], c_2_2); + c_2_3 = fma(a_2[0], b_0[3], c_2_3); + c_2_4 = fma(a_2[0], b_0[4], c_2_4); + c_3_0 = fma(a_3[0], b_0[0], c_3_0); + c_3_1 = fma(a_3[0], b_0[1], c_3_1); + c_3_2 = fma(a_3[0], b_0[2], c_3_2); + c_3_3 = fma(a_3[0], b_0[3], c_3_3); + c_3_4 = fma(a_3[0], b_0[4], c_3_4); + } + c_0[0] = c_0_0; + c_0[1] = c_0_1; + c_0[2] = c_0_2; + c_0[3] = c_0_3; + c_0[4] = c_0_4; + c_1[0] = c_1_0; + c_1[1] = c_1_1; + c_1[2] = c_1_2; + c_1[3] = c_1_3; + c_1[4] = c_1_4; + c_2[0] = c_2_0; + c_2[1] = c_2_1; + c_2[2] = c_2_2; + c_2[3] = c_2_3; + c_2[4] = c_2_4; + c_3[0] = c_3_0; + c_3[1] = c_3_1; + c_3[2] = c_3_2; + c_3[3] = c_3_3; + c_3[4] = c_3_4; +} diff --git a/benchmarks/mt-matmul/mt-matmul.c b/benchmarks/mt-matmul/mt-matmul.c index e795b50..1584a5d 100644 --- a/benchmarks/mt-matmul/mt-matmul.c +++ b/benchmarks/mt-matmul/mt-matmul.c @@ -33,8 +33,6 @@ typedef double data_t; // Basic Utilities and Multi-thread Support __thread unsigned long coreid; -unsigned long ncores; -#define ncores ncores #include "util.h" @@ -93,15 +91,14 @@ void __attribute__((noinline)) matmul(const int lda, const data_t A[], const da void thread_entry(int cid, int nc) { coreid = cid; - ncores = nc; // static allocates data in the binary, which is visible to both threads static data_t results_data[ARRAY_SIZE]; // Execute the provided, naive matmul - barrier(); - stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier()); + barrier(nc); + stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc)); // verify @@ -115,12 +112,12 @@ void thread_entry(int cid, int nc) if (coreid == 0) for (i=0; i < ARRAY_SIZE; i++) results_data[i] = 0; - barrier(); + barrier(nc); // Execute your faster matmul - barrier(); - stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier()); + barrier(nc); + stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc)); #ifdef DEBUG printArray("results:", ARRAY_SIZE, results_data); @@ -131,7 +128,7 @@ void thread_entry(int cid, int nc) res = verify(ARRAY_SIZE, results_data, verify_data); if (res) exit(res); - barrier(); + barrier(nc); #endif exit(0); diff --git a/benchmarks/mt-vvadd/mt-vvadd.c b/benchmarks/mt-vvadd/mt-vvadd.c index 60aa2e7..2116115 100644 --- a/benchmarks/mt-vvadd/mt-vvadd.c +++ b/benchmarks/mt-vvadd/mt-vvadd.c @@ -32,8 +32,6 @@ typedef double data_t; // Basic Utilities and Multi-thread Support __thread unsigned long coreid; -unsigned long ncores; -#define ncores ncores #include "util.h" @@ -52,7 +50,7 @@ unsigned long ncores; // vvadd function //perform in-place vvadd -void __attribute__((noinline)) vvadd(size_t n, data_t* __restrict__ x, const data_t* __restrict__ y) +void __attribute__((noinline)) vvadd(int ncores, size_t n, data_t* __restrict__ x, const data_t* __restrict__ y) { size_t i; @@ -79,7 +77,6 @@ void __attribute__((noinline)) vvadd_opt(size_t n, data_t* __restrict__ x, const void thread_entry(int cid, int nc) { coreid = cid; - ncores = nc; // static allocates data in the binary, which is visible to both threads static data_t results_data[DATA_SIZE]; @@ -97,8 +94,8 @@ void thread_entry(int cid, int nc) // Execute the provided, terrible vvadd - barrier(); - stats(vvadd(DATA_SIZE, results_data, input2_data); barrier()); + barrier(nc); + stats(vvadd(nc, DATA_SIZE, results_data, input2_data); barrier(nc)); // verify @@ -113,11 +110,11 @@ void thread_entry(int cid, int nc) for (i=0; i < DATA_SIZE; i++) results_data[i] = input1_data[i]; } - barrier(); + barrier(nc); // Execute your faster vvadd - barrier(); - stats(vvadd_opt(DATA_SIZE, results_data, input2_data); barrier()); + barrier(nc); + stats(vvadd_opt(DATA_SIZE, results_data, input2_data); barrier(nc)); #ifdef DEBUG printDoubleArray("results: ", DATA_SIZE, results_data); @@ -128,7 +125,7 @@ void thread_entry(int cid, int nc) res = verifyDouble(DATA_SIZE, results_data, verify_data); if (res) exit(res); - barrier(); + barrier(nc); #endif exit(0); -- 2.30.2