25template <
typename T>
inline
26void SpmvCsr(T *val,
int *cind,
int *rptr, T *x, T *y,
int N) {
27 #pragma omp parallel for
28 for(
int i=0; i<N; i++) {
30 for(
int j=rptr[i]; j<rptr[i+1]; j++) {
31 temp += val[j] * x[cind[j]];
47template <
typename T>
inline
48void SpmvCsr(T *val,
int *cind,
int *rptr, T *diag, T *x, T *y,
int N) {
49 #pragma omp parallel for
50 for(
int i=0; i<N; i++) {
51 T temp = x[i] * diag[i];
52 for(
int j=rptr[i]; j<rptr[i+1]; j++) {
53 temp += val[j] * x[cind[j]];
70template <
typename T,
int bnl,
int bnw>
inline
71void SpmvBcsr(T *bval,
int *bcind,
int *brptr, T *x, T *y,
int N) {
72 int b_size = bnl * bnw;
73 #pragma omp parallel for
74 for(
int i=0; i<N; i+=bnl) {
76 #pragma omp simd simdlen(bnl)
77 for(
int j=0; j<bnl; j++) {
80 for(
int j=brptr[bidx]; j<brptr[bidx+1]; j++) {
81 int x_ind = bcind[j]*bnw;
82 for(
int l=0; l<bnw; l++) {
83 int off = j*b_size+l*bnl;
84 #pragma omp simd simdlen(bnl)
85 for(
int k=0; k<bnl; k++) {
86 y[i+k] += bval[off+k] * x[x_ind+l];
/**
 * @brief Perform SpMV (y = A * x) using the sliced-ELLPACK (SELL-c) format.
 *
 * Rows are grouped into slices of `len` consecutive rows. Slice `i` occupies
 * `val`/`cind` starting at element `wid[i] * len` and is `wid[i+1] - wid[i]`
 * columns wide; inside a slice the entries are stored column by column, so
 * entry `j` of local row `k` lives at `wid[i] * len + j * len + k`.
 *
 * @tparam T   Element type of the matrix values and vectors.
 * @param val  Slice-packed matrix values.
 * @param cind Column index for every entry of `val` (same layout).
 * @param wid  Slice offsets, in units of slice columns; `ceil(N/len) + 1` entries are read.
 * @param len  Number of rows per slice.
 * @param x    Input vector.
 * @param y    Output vector; exactly the first `N` entries are written.
 * @param N    Number of matrix rows.
 */
template <typename T>
inline void SpmvSell(T *val, int *cind, int *wid, int len, T *x, T *y, int N)
{
    // Nothing to compute; this also avoids a division by zero below.
    if (N <= 0 || len <= 0) {
        return;
    }

    const int block = (N + len - 1) / len;
    #pragma omp parallel for
    for (int i = 0; i < block; i++) {
        const int row0  = i * len;
        const int start = wid[i] * len;
        const int width = wid[i + 1] - wid[i];
        // Only the final slice (index block-1) can hold fewer than `len`
        // rows. Clamping against N keeps every write inside y[0..N-1].
        const int rows = (N - row0 < len) ? (N - row0) : len;

        if (width <= 0) {
            // A slice with no stored columns contributes all-zero rows.
            for (int k = 0; k < rows; k++) {
                y[row0 + k] = T(0);
            }
            continue;
        }

        // The first column initialises y, the remaining columns accumulate.
        for (int k = 0; k < rows; k++) {
            y[row0 + k] = val[start + k] * x[cind[start + k]];
        }
        for (int j = 1; j < width; j++) {
            const int off = start + j * len;
            for (int k = 0; k < rows; k++) {
                y[row0 + k] += val[off + k] * x[cind[off + k]];
            }
        }
    }
}
132template <
typename T>
inline
136 for(
int i=0; i<N; i++) {
138 for(
int j=rptr[i]; j<rptr[i+1]; j++) {
139 temp -= val[j] * y[cind[j]];
154template <
typename T>
inline
159 for(
int i=N-1; i>=0; i--) {
162 for(j=rptr[i+1]-1; j>=rptr[i]+1; j--) {
163 temp -= val[j] * y[cind[j]];
165 y[i] = temp * val[j];
180template <
typename T>
inline
182 T *val,
int *cind,
int *rptr, T *x, T *y,
183 int N,
int *cptr,
int cnum)
188 for(
int k=0; k<cnum; k++) {
192 for(
int i=start; i<end; i++) {
194 for(
int j=rptr[i]; j<rptr[i+1]; j++) {
195 temp -= val[j] * y[cind[j]];
214template <
typename T>
inline
216 int N,
int *cptr,
int cnum)
222 for(
int k=cnum-1; k>=0; k--) {
226 for(
int i=end-1; i>=start; i--) {
229 for(j=rptr[i+1]-1; j>=rptr[i]+1; j--) {
230 temp -= val[j] * y[cind[j]];
232 y[i] = temp * val[j];
250template <
typename T>
inline
252 T *val,
int *cind,
int *rptr, T *x, T *y,
253 int N,
int *cptr,
int cnum,
int bsize)
258 for(
int k=0; k<cnum; k++) {
262 for(
int i=start; i<end; i++) {
264 for(
int l=0; l<bsize; l++) {
267 for(
int j=rptr[idx]; j<rptr[idx+1]; j++) {
268 temp -= val[j] * y[cind[j]];
289template <
typename T>
inline
291 int N,
int *cptr,
int cnum,
int bsize)
297 for(
int k=cnum-1; k>=0; k--) {
301 for(
int i=end-1; i>=start; i--) {
303 for(
int l=bsize-1; l>=0; l--) {
307 for(j=rptr[idx+1]-1; j>=rptr[idx]+1; j--) {
308 temp -= val[j] * y[cind[j]];
310 y[idx] = temp * val[j];
328template <
typename T>
inline
333 int bsize = N / bnum;
334 #pragma omp parallel for num_threads(bnum)
335 for(
int k=0; k<bnum; k++) {
337 int end = (k+1)*bsize;
338 for(
int i=start; i<end; i++) {
340 for(
int j=rptr[i]; j<rptr[i+1]; j++) {
341 temp -= val[j] * y[cind[j]];
359template <
typename T>
inline
365 int bsize = N / bnum;
366 #pragma omp parallel for num_threads(bnum)
367 for(
int k=0; k<bnum; k++) {
369 int end = (k+1)*bsize;
370 for(
int i=end-1; i>=start; i--) {
373 for(j=rptr[i+1]-1; j>=rptr[i]+1; j--) {
374 temp -= val[j] * y[cind[j]];
376 y[i] = temp * val[j];
392template <
typename T,
int bnl,
int bnw>
inline
394 T *bval,
int *bcind,
int *brptr,
398 int b_size = bnl * bnw;
399 for(
int i=0; i<N; i+=bnl) {
401 #pragma omp simd simdlen(bnl)
402 for(
int j=0; j<bnl; j++) {
405 for(
int j=brptr[bidx]; j<brptr[bidx+1]; j++) {
406 int x_ind = bcind[j]*bnw;
407 for(
int l=0; l<bnw; l++) {
408 int off = j*b_size+l*bnl;
409 #pragma omp simd simdlen(bnl)
410 for(
int k=0; k<bnl; k++) {
411 y[i+k] -= bval[off+k] * y[x_ind+l];
429template <
typename T,
int bnl,
int bnw>
inline
431 T *bval,
int *bcind,
int *brptr,
434 int b_size = bnl * bnw;
435 int b_rem = bnl / bnw;
436 for(
int i=N-bnl; i>=0; i-=bnl) {
438 #pragma omp simd simdlen(bnl)
439 for(
int j=0; j<bnl; j++) {
442 for(
int j=brptr[bidx+1]-1; j>=brptr[bidx]+b_rem; j--) {
443 int x_ind = bcind[j]*bnw;
444 for(
int l=0; l<bnw; l++) {
445 int off = j*b_size+l*bnl;
446 #pragma omp simd simdlen(bnl)
447 for(
int k=0; k<bnl; k++) {
448 y[i+k] -= bval[off+k] * y[x_ind+l];
452 int pos = brptr[bidx]+b_rem-1;
453 for(
int k=b_rem-1; k>=0; k--) {
454 for(
int j=bnw-1; j>=0; j--) {
455 int off = pos*b_size+j*bnl;
457 y[i+idx] *= bval[off+idx];
458 for(
int l=k*bnw+j-1; l>=0; l--) {
459 y[i+l] -= bval[off+l] * y[i+idx];
481template <
typename T,
int bnl,
int bnw>
inline
483 T *bval,
int *bcind,
int *brptr, T *x, T *y,
484 int N,
int *cptr,
int cnum,
int bsize)
487 int b_size = bnl * bnw;
490 for(
int k=0; k<cnum; k++) {
494 for(
int i=start; i<end; i++) {
496 for(
int l=0; l<bsize; l+=bnl) {
498 int bidx = idx / bnl;
499 #pragma omp simd simdlen(bnl)
500 for(
int j=0; j<bnl; j++) {
503 for(
int j=brptr[bidx]; j<brptr[bidx+1]; j++) {
504 int x_ind = bcind[j]*bnw;
505 for(
int m=0; m<bnw; m++) {
506 int off = j*b_size+m*bnl;
507 #pragma omp simd simdlen(bnl)
508 for(
int n=0; n<bnl; n++) {
509 y[idx+n] -= bval[off+n] * y[x_ind+m];
533template <
typename T,
int bnl,
int bnw>
inline
535 T *bval,
int *bcind,
int *brptr, T *x, T *y,
536 int N,
int *cptr,
int cnum,
int bsize)
538 int b_size = bnl * bnw;
539 int b_rem = bnl / bnw;
542 for(
int k=cnum-1; k>=0; k--) {
546 for(
int i=end-1; i>=start; i--) {
548 for(
int l=bsize-bnl; l>=0; l-=bnl) {
550 int bidx = idx / bnl;
551 #pragma omp simd simdlen(bnl)
552 for(
int j=0; j<bnl; j++) {
555 for(
int j=brptr[bidx+1]-1; j>=brptr[bidx]+b_rem; j--) {
556 int x_ind = bcind[j]*bnw;
557 for(
int n=0; n<bnw; n++) {
558 int off = j*b_size+n*bnl;
559 #pragma omp simd simdlen(bnl)
560 for(
int m=0; m<bnl; m++) {
561 y[idx+m] -= bval[off+m] * y[x_ind+n];
565 int pos = brptr[bidx]+b_rem-1;
566 for(
int m=b_rem-1; m>=0; m--) {
567 for(
int j=bnw-1; j>=0; j--) {
568 int off = pos*b_size+j*bnl;
570 y[idx+ind] *= bval[off+ind];
571 for(
int n=m*bnw+j-1; n>=0; n--) {
572 y[idx+n] -= bval[off+n] * y[idx+ind];
void SpmvCsr(T *val, int *cind, int *rptr, T *x, T *y, int N)
Perform SpMV using the CSR format.
void SptrsvBcsr_u(T *bval, int *bcind, int *brptr, T *x, T *y, int N)
Perform the sparse upper triangular solve for a matrix stored in the BCSR format.
void SpmvBcsr(T *bval, int *bcind, int *brptr, T *x, T *y, int N)
Perform SpMV using the BCSR format.
void SpmvSell(T *val, int *cind, int *wid, int len, T *x, T *y, int N)
Perform SpMV using the sliced-ELLPACK (SELL-c) format.
void SptrsvCsr_l(T *val, int *cind, int *rptr, T *x, T *y, int N)
Perform the sparse lower triangular solve on a matrix stored in the CSR format.
void SptrsvBcsr_l(T *bval, int *bcind, int *brptr, T *x, T *y, int N)
Perform the sparse lower triangular solve for a matrix stored in the BCSR format.
void SptrsvCsr_u(T *val, int *cind, int *rptr, T *x, T *y, int N)
Perform the sparse upper triangular solve on a matrix stored in the CSR format.
The top-level namespace of SenK.