1. C++ / Говнокод #28432

    −7

    1. 01
    2. 02
    3. 03
    4. 04
    5. 05
    6. 06
    7. 07
    8. 08
    9. 09
    10. 10
    11. 11
    12. 12
    13. 13
    14. 14
    15. 15
    16. 16
    17. 17
    18. 18
    19. 19
    20. 20
    21. 21
    22. 22
    23. 23
    24. 24
    25. 25
    26. 26
    27. 27
    28. 28
    29. 29
    30. 30
    31. 31
    32. 32
    33. 33
    34. 34
    35. 35
    36. 36
    37. 37
    38. 38
    39. 39
    40. 40
    41. 41
    42. 42
    43. 43
    44. 44
    45. 45
    46. 46
    47. 47
    48. 48
    49. 49
    50. 50
    51. 51
    52. 52
    53. 53
    54. 54
    55. 55
    56. 56
    57. 57
    58. 58
    59. 59
    60. 60
    61. 61
    62. 62
    63. 63
    64. 64
    65. 65
    66. 66
    67. 67
    68. 68
    69. 69
    70. 70
    71. 71
    72. 72
    73. 73
    74. 74
    75. 75
    76. 76
    77. 77
    78. 78
    79. 79
    80. 80
    81. 81
    82. 82
    83. 83
    84. 84
    85. 85
    86. 86
    87. 87
    88. 88
    if (neuronsV.size() >= 8)
    	{
    		auto count = neuronsV.size() - neuronsV.size() % 8;
    
    		__m256* vs = static_cast<__m256*>(alloca(count * sizeof(float) * 3));
    		__m256* ws = vs + count / 8;
    		__m256* ins = ws + count / 8;
    
    		for (int i = 0; i < count / 8; ++i)
    		{
    			vs[i] = _mm256_load_ps(&neuronsV[i * 8]);
    			ws[i] = _mm256_load_ps(&neuronsW[i * 8]);
    			ins[i] = _mm256_load_ps(&neuronsIn[i * 8]);
    		}
    
    		static const __m256 div3 = [](void) -> __m256
    		{
    			float temp[]{ 3.f, 3.f, 3.f, 3.f, 3.f, 3.f, 3.f, 3.f };
    			return _mm256_load_ps(temp);
    		}();
    
    		__m256 iextv = [&iext](void) -> __m256
    		{
    			float temp[]{ iext, iext, iext, iext, iext, iext, iext, iext };
    			return _mm256_load_ps(temp);
    		}();
    
    		__m256 exprdtv = [&dt](void) -> __m256
    		{
    			float temp[]{ expr * dt, expr * dt, expr * dt, expr * dt, expr * dt, expr * dt, expr * dt, expr * dt };
    			return _mm256_load_ps(temp);
    		}();
    
    		static const __m256 av = [](void) -> __m256
    		{
    			float temp[]{ a, a, a, a, a, a, a, a };
    			return _mm256_load_ps(temp);
    		}();
    
    		static const __m256 bv = [](void) -> __m256
    		{
    			float temp[]{ b, b, b, b, b, b, b, b };
    			return _mm256_load_ps(temp);
    		}();
    
    		__m256 thetadt = [&dt](void) -> __m256
    		{
    			float temp[]{ dt / theta, dt / theta, dt / theta, dt / theta, dt / theta, dt / theta, dt / theta, dt / theta };
    			return _mm256_load_ps(temp);
    		}();
    
    		for (int i = 0; i < count / 8; ++i)
    		{
    			// vs += (vs - (vs * vs * vs) / 3.f - ws - iext - ins) * exprdtv
    			__m256 nv = _mm256_mul_ps(vs[i], _mm256_mul_ps(vs[i], vs[i]));
    			nv = _mm256_sub_ps(vs[i], _mm256_div_ps(nv, div3));
    			nv = _mm256_sub_ps(nv, ws[i]);
    			nv = _mm256_add_ps(nv, iextv);
    			nv = _mm256_add_ps(nv, ins[i]);
    			nv = _mm256_mul_ps(nv, exprdtv);
    			vs[i] = _mm256_add_ps(vs[i], nv);
    
    			// ws += (vs - av - ws * bv) * thetadt
    			nv = _mm256_sub_ps(vs[i], av);
    			nv = _mm256_sub_ps(nv, _mm256_mul_ps(ws[i], bv));
    			ws[i] = _mm256_add_ps(ws[i], _mm256_mul_ps(nv, thetadt));
    		}
    
    		for (int i = 0; i < count / 8; ++i)
    		{
    			_mm256_storeu_ps(&neuronsV[i * 8], vs[i]);
    			_mm256_storeu_ps(&neuronsW[i * 8], ws[i]);
    		}
    	}
    
    	// scalar edition
    	for (int i = 0; i < neuronsV.size() % 8; ++i)
    	{
    		auto off = neuronsV.size() - 1 - i;
    		auto& v = neuronsV[off];
    		auto& w = neuronsW[off];
    		auto& in = neuronsIn[off];
    
    		v += (v - (v * v * v) / 3.f - w - iext - in) * expr * dt;
    		w += (v - a - w * b) * dt / theta;
    	}
    
    	std::ranges::fill(neuronsIn, 0.f);

    Царский анрол

    Запостил: kcalbCube, 31 Октября 2022

    Комментарии (14) RSS

    Добавить комментарий