| Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE |
[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]
|
Hello Christoph and Gael, I tried to extract the relevant assembly and attached it. The file eigen_forceinline.txt contains the critical part as generated by default; the file eigen_inline.txt contains the same part, but built with EIGEN_STRONG_INLINE set to "inline". In both cases the compiler options are: -O3 -g -DNDEBUG -fPIC -xHost -qopenmp -std=gnu++11 Thanks and regards, PS: Thank you :-) |
#pragma omp for schedule(static)
837c9b: 41 ff cf dec %r15d
837c9e: bb 01 00 00 00 mov $0x1,%ebx
837ca3: 44 89 ac 24 20 01 00 mov %r13d,0x120(%rsp)
837caa: 00
837cab: 48 8d 3d 22 77 37 00 lea 0x377722(%rip),%rdi # baf3d4 <.2.22385_2_kmpc_loc_struct_pack.1092>
837cb2: 44 89 bc 24 24 01 00 mov %r15d,0x124(%rsp)
837cb9: 00
837cba: ba 22 00 00 00 mov $0x22,%edx
837cbf: 44 89 ac 24 28 01 00 mov %r13d,0x128(%rsp)
837cc6: 00
837cc7: 89 9c 24 2c 01 00 00 mov %ebx,0x12c(%rsp)
837cce: 48 83 c4 e0 add $0xffffffffffffffe0,%rsp
837cd2: 48 8d 84 24 4c 01 00 lea 0x14c(%rsp),%rax
837cd9: 00
837cda: 48 8d 8c 24 48 01 00 lea 0x148(%rsp),%rcx
837ce1: 00
837ce2: 4c 8d 84 24 40 01 00 lea 0x140(%rsp),%r8
837ce9: 00
837cea: 48 89 04 24 mov %rax,(%rsp)
837cee: 4c 8d 8c 24 44 01 00 lea 0x144(%rsp),%r9
837cf5: 00
837cf6: 89 5c 24 08 mov %ebx,0x8(%rsp)
837cfa: 89 5c 24 10 mov %ebx,0x10(%rsp)
837cfe: 8b 70 3c mov 0x3c(%rax),%esi
837d01: e8 3a 24 db ff callq 5ea140 <__kmpc_for_static_init_4u@plt>
837d06: 48 83 c4 20 add $0x20,%rsp
837d0a: 8b 84 24 20 01 00 00 mov 0x120(%rsp),%eax
837d11: 8b 94 24 24 01 00 00 mov 0x124(%rsp),%edx
837d18: 41 3b c7 cmp %r15d,%eax
837d1b: 0f 87 ca 00 00 00 ja 837deb <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbab>
837d21: 41 3b d7 cmp %r15d,%edx
837d24: 44 0f 42 fa cmovb %edx,%r15d
for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
837d28: 41 3b c7 cmp %r15d,%eax
837d2b: 0f 87 ba 00 00 00 ja 837deb <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbab>
#pragma omp for schedule(static)
837d31: 44 2b f8 sub %eax,%r15d
837d34: 44 89 eb mov %r13d,%ebx
837d37: 44 89 b4 24 f8 00 00 mov %r14d,0xf8(%rsp)
837d3e: 00
837d3f: 41 ff c7 inc %r15d
837d42: 41 89 c6 mov %eax,%r14d
unsigned int mat_idx = m_mat_indices[i];
837d45: 4d 8b 04 24 mov (%r12),%r8
for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
837d49: 45 8d 2c 1e lea (%r14,%rbx,1),%r13d
837d4d: 4d 63 ed movslq %r13d,%r13
EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
837d50: 48 8d 3c 24 lea (%rsp),%rdi
837d54: 48 8d 74 24 40 lea 0x40(%rsp),%rsi
unsigned int mat_idx = m_mat_indices[i];
837d59: 49 8b 90 b8 00 00 00 mov 0xb8(%r8),%rdx
{ return *(this->_M_impl._M_start + __n); }
837d60: 4d 8b 98 d8 00 00 00 mov 0xd8(%r8),%r11
(m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
837d67: 4d 8b 48 60 mov 0x60(%r8),%r9
837d6b: 4f 8d 54 6d 00 lea 0x0(%r13,%r13,2),%r10
unsigned int mat_idx = m_mat_indices[i];
837d70: 42 8b 0c aa mov (%rdx,%r13,4),%ecx
837d74: 48 69 d1 38 02 00 00 imul $0x238,%rcx,%rdx
(m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
837d7b: 4f 8d 2c d1 lea (%r9,%r10,8),%r13
}
// static inline void
static void
update_density(const sim_constants &sc, density& d, real e, real *p_t) {
d = sc.A1 * (d + sc.d_in) - sc.d_in;
837d7f: 4c 89 6e 08 mov %r13,0x8(%rsi)
837d83: 49 8d 8c 13 08 02 00 lea 0x208(%r11,%rdx,1),%rcx
837d8a: 00
837d8b: 48 89 4e 10 mov %rcx,0x10(%rsi)
837d8f: 48 89 4e 20 mov %rcx,0x20(%rsi)
837d93: 49 8d 94 13 90 01 00 lea 0x190(%r11,%rdx,1),%rdx
837d9a: 00
837d9b: 48 89 16 mov %rdx,(%rsi)
837d9e: e8 dd 96 db ff callq 5f1480 <Eigen::internal::binary_evaluator<Eigen::CwiseBinaryOp<Eigen::internal::scalar_difference_op<double, double>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 0> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, Eigen::internal::IndexBased, Eigen::internal::IndexBased, double, double>::binary_evaluator(Eigen::CwiseBinaryOp<Eigen::internal::scalar_difference_op<double, double>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 0> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const> const&)@plt>
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
837da3: 48 8b 54 24 08 mov 0x8(%rsp),%rdx
#pragma omp for schedule(static)
837da8: ff c3 inc %ebx
837daa: 48 8b 4c 24 30 mov 0x30(%rsp),%rcx
837daf: 41 3b df cmp %r15d,%ebx
return _mm_loadu_pd(from);
837db2: c5 f9 10 02 vmovupd (%rdx),%xmm0
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
837db6: c5 f9 5c 09 vsubpd (%rcx),%xmm0,%xmm1
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
837dba: c4 c1 79 11 4d 00 vmovupd %xmm1,0x0(%r13)
837dc0: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
837dc5: 48 8b 7c 24 30 mov 0x30(%rsp),%rdi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
837dca: c5 fb 10 56 10 vmovsd 0x10(%rsi),%xmm2
837dcf: c5 eb 5c 5f 10 vsubsd 0x10(%rdi),%xmm2,%xmm3
837dd4: c4 c1 7b 11 5d 10 vmovsd %xmm3,0x10(%r13)
837dda: 0f 82 65 ff ff ff jb 837d45 <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xb05>
837de0: 44 8b b4 24 f8 00 00 mov 0xf8(%rsp),%r14d
837de7: 00
837de8: 45 33 ed xor %r13d,%r13d
837deb: 48 8d 3d e2 75 37 00 lea 0x3775e2(%rip),%rdi # baf3d4 <.2.22385_2_kmpc_loc_struct_pack.1092>
837df2: 8b b4 24 68 01 00 00 mov 0x168(%rsp),%esi
837df9: e8 92 a7 de ff callq 622590 <__kmpc_for_static_fini@plt>
#pragma omp for schedule(static)
8468db: 41 ff cf dec %r15d
8468de: bb 01 00 00 00 mov $0x1,%ebx
8468e3: 44 89 ac 24 18 01 00 mov %r13d,0x118(%rsp)
8468ea: 00
8468eb: 48 8d 3d a2 ab 37 00 lea 0x37aba2(%rip),%rdi # bc1494 <.2.22438_2_kmpc_loc_struct_pack.1251>
8468f2: 44 89 bc 24 1c 01 00 mov %r15d,0x11c(%rsp)
8468f9: 00
8468fa: ba 22 00 00 00 mov $0x22,%edx
8468ff: 44 89 ac 24 20 01 00 mov %r13d,0x120(%rsp)
846906: 00
846907: 89 9c 24 24 01 00 00 mov %ebx,0x124(%rsp)
84690e: 48 83 c4 e0 add $0xffffffffffffffe0,%rsp
846912: 48 8d 84 24 44 01 00 lea 0x144(%rsp),%rax
846919: 00
84691a: 48 8d 8c 24 40 01 00 lea 0x140(%rsp),%rcx
846921: 00
846922: 4c 8d 84 24 38 01 00 lea 0x138(%rsp),%r8
846929: 00
84692a: 48 89 04 24 mov %rax,(%rsp)
84692e: 4c 8d 8c 24 3c 01 00 lea 0x13c(%rsp),%r9
846935: 00
846936: 89 5c 24 08 mov %ebx,0x8(%rsp)
84693a: 89 5c 24 10 mov %ebx,0x10(%rsp)
84693e: 8b 70 3c mov 0x3c(%rax),%esi
846941: e8 7a a0 da ff callq 5f09c0 <__kmpc_for_static_init_4u@plt>
846946: 48 83 c4 20 add $0x20,%rsp
84694a: 8b 84 24 18 01 00 00 mov 0x118(%rsp),%eax
846951: 8b 94 24 1c 01 00 00 mov 0x11c(%rsp),%edx
846958: 41 3b c7 cmp %r15d,%eax
84695b: 0f 87 e0 00 00 00 ja 846a41 <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbb1>
846961: 41 3b d7 cmp %r15d,%edx
846964: 44 0f 42 fa cmovb %edx,%r15d
for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
846968: 41 3b c7 cmp %r15d,%eax
84696b: 0f 87 d0 00 00 00 ja 846a41 <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbb1>
#pragma omp for schedule(static)
846971: 44 2b f8 sub %eax,%r15d
846974: 44 89 eb mov %r13d,%ebx
846977: 41 ff c7 inc %r15d
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
84697a: 44 89 bc 24 e8 00 00 mov %r15d,0xe8(%rsp)
846981: 00
846982: 41 89 c7 mov %eax,%r15d
846985: 44 89 b4 24 f0 00 00 mov %r14d,0xf0(%rsp)
84698c: 00
unsigned int mat_idx = m_mat_indices[i];
84698d: 4d 8b 04 24 mov (%r12),%r8
for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
846991: 45 8d 2c 1f lea (%r15,%rbx,1),%r13d
846995: 4d 63 ed movslq %r13d,%r13
SrcEvaluatorType srcEvaluator(src);
846998: 48 8d 7c 24 18 lea 0x18(%rsp),%rdi
84699d: 48 89 7f f0 mov %rdi,-0x10(%rdi)
call_dense_assignment_loop(dst, src, func);
8469a1: 48 8d 74 24 68 lea 0x68(%rsp),%rsi
unsigned int mat_idx = m_mat_indices[i];
8469a6: 4d 8b b0 b8 00 00 00 mov 0xb8(%r8),%r14
{ return *(this->_M_impl._M_start + __n); }
8469ad: 4d 8b 98 d8 00 00 00 mov 0xd8(%r8),%r11
(m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
8469b4: 4d 8b 48 60 mov 0x60(%r8),%r9
8469b8: 4f 8d 54 6d 00 lea 0x0(%r13,%r13,2),%r10
unsigned int mat_idx = m_mat_indices[i];
8469bd: 43 8b 0c ae mov (%r14,%r13,4),%ecx
8469c1: 48 69 d1 38 02 00 00 imul $0x238,%rcx,%rdx
}
// static inline void
static void
update_density(const sim_constants &sc, density& d, real e, real *p_t) {
d = sc.A1 * (d + sc.d_in) - sc.d_in;
8469c8: 4d 8d b4 13 08 02 00 lea 0x208(%r11,%rdx,1),%r14
8469cf: 00
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
8469d0: 4c 89 76 10 mov %r14,0x10(%rsi)
8469d4: 49 8d 94 13 90 01 00 lea 0x190(%r11,%rdx,1),%rdx
8469db: 00
8469dc: 48 89 16 mov %rdx,(%rsi)
8469df: 48 8d 94 24 68 01 00 lea 0x168(%rsp),%rdx
8469e6: 00
(m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
8469e7: 4f 8d 2c d1 lea (%r9,%r10,8),%r13
8469eb: 4c 89 6e 08 mov %r13,0x8(%rsi)
8469ef: e8 6c cc da ff callq 5f3660 <void Eigen::internal::call_dense_assignment_loop<Eigen::Matrix<double, 3, 1, 0, 3, 1>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 1>, Eigen::internal::assign_op<double, double> >(Eigen::Matrix<double, 3, 1, 0, 3, 1>&, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 1> const&, Eigen::internal::assign_op<double, double> const&)@plt>
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
8469f4: 48 8b 54 24 08 mov 0x8(%rsp),%rdx
#pragma omp for schedule(static)
8469f9: ff c3 inc %ebx
SrcEvaluatorType srcEvaluator(src);
8469fb: 4c 89 74 24 30 mov %r14,0x30(%rsp)
return _mm_loadu_pd(from);
846a00: c5 f9 10 02 vmovupd (%rdx),%xmm0
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
846a04: c4 c1 79 5c 0e vsubpd (%r14),%xmm0,%xmm1
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
846a09: c4 c1 79 11 4d 00 vmovupd %xmm1,0x0(%r13)
Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
846a0f: 48 8b 4c 24 08 mov 0x8(%rsp),%rcx
846a14: 48 8b 74 24 30 mov 0x30(%rsp),%rsi
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
846a19: c5 fb 10 51 10 vmovsd 0x10(%rcx),%xmm2
846a1e: c5 eb 5c 5e 10 vsubsd 0x10(%rsi),%xmm2,%xmm3
846a23: c4 c1 7b 11 5d 10 vmovsd %xmm3,0x10(%r13)
846a29: 3b 9c 24 e8 00 00 00 cmp 0xe8(%rsp),%ebx
846a30: 0f 82 57 ff ff ff jb 84698d <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xafd>
846a36: 44 8b b4 24 f0 00 00 mov 0xf0(%rsp),%r14d
846a3d: 00
846a3e: 45 33 ed xor %r13d,%r13d
846a41: 48 8d 3d 4c aa 37 00 lea 0x37aa4c(%rip),%rdi # bc1494 <.2.22438_2_kmpc_loc_struct_pack.1251>
846a48: 8b b4 24 60 01 00 00 mov 0x160(%rsp),%esi
846a4f: e8 9c 24 de ff callq 628ef0 <__kmpc_for_static_fini@plt>
| Mail converted by MHonArc 2.6.19+ | http://listengine.tuxfamily.org/ |