Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE |
[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]
Hello Christoph and Gael, I tried to extract the relevant assembly and attached it. The file eigen_forceinline.txt contains the critical part as generated by default; the file eigen_inline.txt contains the same part, but built with EIGEN_STRONG_INLINE set to "inline". The compiler options in both cases are -O3 -g -DNDEBUG -fPIC -xHost -qopenmp -std=gnu++11 Thanks and regards, PS.: Thank you :-) |
#pragma omp for schedule(static) 837c9b: 41 ff cf dec %r15d 837c9e: bb 01 00 00 00 mov $0x1,%ebx 837ca3: 44 89 ac 24 20 01 00 mov %r13d,0x120(%rsp) 837caa: 00 837cab: 48 8d 3d 22 77 37 00 lea 0x377722(%rip),%rdi # baf3d4 <.2.22385_2_kmpc_loc_struct_pack.1092> 837cb2: 44 89 bc 24 24 01 00 mov %r15d,0x124(%rsp) 837cb9: 00 837cba: ba 22 00 00 00 mov $0x22,%edx 837cbf: 44 89 ac 24 28 01 00 mov %r13d,0x128(%rsp) 837cc6: 00 837cc7: 89 9c 24 2c 01 00 00 mov %ebx,0x12c(%rsp) 837cce: 48 83 c4 e0 add $0xffffffffffffffe0,%rsp 837cd2: 48 8d 84 24 4c 01 00 lea 0x14c(%rsp),%rax 837cd9: 00 837cda: 48 8d 8c 24 48 01 00 lea 0x148(%rsp),%rcx 837ce1: 00 837ce2: 4c 8d 84 24 40 01 00 lea 0x140(%rsp),%r8 837ce9: 00 837cea: 48 89 04 24 mov %rax,(%rsp) 837cee: 4c 8d 8c 24 44 01 00 lea 0x144(%rsp),%r9 837cf5: 00 837cf6: 89 5c 24 08 mov %ebx,0x8(%rsp) 837cfa: 89 5c 24 10 mov %ebx,0x10(%rsp) 837cfe: 8b 70 3c mov 0x3c(%rax),%esi 837d01: e8 3a 24 db ff callq 5ea140 <__kmpc_for_static_init_4u@plt> 837d06: 48 83 c4 20 add $0x20,%rsp 837d0a: 8b 84 24 20 01 00 00 mov 0x120(%rsp),%eax 837d11: 8b 94 24 24 01 00 00 mov 0x124(%rsp),%edx 837d18: 41 3b c7 cmp %r15d,%eax 837d1b: 0f 87 ca 00 00 00 ja 837deb <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbab> 837d21: 41 3b d7 cmp %r15d,%edx 837d24: 44 0f 42 fa cmovb %edx,%r15d for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) { 837d28: 41 3b c7 cmp %r15d,%eax 837d2b: 0f 87 ba 00 00 00 ja 837deb <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbab> #pragma omp for schedule(static) 837d31: 44 2b f8 sub %eax,%r15d 837d34: 44 89 eb mov %r13d,%ebx 837d37: 44 89 b4 24 f8 00 00 mov %r14d,0xf8(%rsp) 837d3e: 00 837d3f: 41 ff c7 inc %r15d 837d42: 41 89 c6 mov %eax,%r14d unsigned int mat_idx = m_mat_indices[i]; 837d45: 4d 8b 04 24 mov (%r12),%r8 for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) { 837d49: 45 8d 2c 1e lea (%r14,%rbx,1),%r13d 837d4d: 4d 63 ed movslq %r13d,%r13 EIGEN_DEVICE_FUNC 
explicit binary_evaluator(const XprType& xpr) 837d50: 48 8d 3c 24 lea (%rsp),%rdi 837d54: 48 8d 74 24 40 lea 0x40(%rsp),%rsi unsigned int mat_idx = m_mat_indices[i]; 837d59: 49 8b 90 b8 00 00 00 mov 0xb8(%r8),%rdx { return *(this->_M_impl._M_start + __n); } 837d60: 4d 8b 98 d8 00 00 00 mov 0xd8(%r8),%r11 (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]); 837d67: 4d 8b 48 60 mov 0x60(%r8),%r9 837d6b: 4f 8d 54 6d 00 lea 0x0(%r13,%r13,2),%r10 unsigned int mat_idx = m_mat_indices[i]; 837d70: 42 8b 0c aa mov (%rdx,%r13,4),%ecx 837d74: 48 69 d1 38 02 00 00 imul $0x238,%rcx,%rdx (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]); 837d7b: 4f 8d 2c d1 lea (%r9,%r10,8),%r13 } // static inline void static void update_density(const sim_constants &sc, density& d, real e, real *p_t) { d = sc.A1 * (d + sc.d_in) - sc.d_in; 837d7f: 4c 89 6e 08 mov %r13,0x8(%rsi) 837d83: 49 8d 8c 13 08 02 00 lea 0x208(%r11,%rdx,1),%rcx 837d8a: 00 837d8b: 48 89 4e 10 mov %rcx,0x10(%rsi) 837d8f: 48 89 4e 20 mov %rcx,0x20(%rsi) 837d93: 49 8d 94 13 90 01 00 lea 0x190(%r11,%rdx,1),%rdx 837d9a: 00 837d9b: 48 89 16 mov %rdx,(%rsi) 837d9e: e8 dd 96 db ff callq 5f1480 <Eigen::internal::binary_evaluator<Eigen::CwiseBinaryOp<Eigen::internal::scalar_difference_op<double, double>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 0> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, Eigen::internal::IndexBased, Eigen::internal::IndexBased, double, double>::binary_evaluator(Eigen::CwiseBinaryOp<Eigen::internal::scalar_difference_op<double, double>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 0> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const> const&)@plt> Kernel kernel(dstEvaluator, 
srcEvaluator, func, dst.const_cast_derived()); 837da3: 48 8b 54 24 08 mov 0x8(%rsp),%rdx #pragma omp for schedule(static) 837da8: ff c3 inc %ebx 837daa: 48 8b 4c 24 30 mov 0x30(%rsp),%rcx 837daf: 41 3b df cmp %r15d,%ebx return _mm_loadu_pd(from); 837db2: c5 f9 10 02 vmovupd (%rdx),%xmm0 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } 837db6: c5 f9 5c 09 vsubpd (%rcx),%xmm0,%xmm1 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } 837dba: c4 c1 79 11 4d 00 vmovupd %xmm1,0x0(%r13) 837dc0: 48 8b 74 24 08 mov 0x8(%rsp),%rsi 837dc5: 48 8b 7c 24 30 mov 0x30(%rsp),%rdi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; } 837dca: c5 fb 10 56 10 vmovsd 0x10(%rsi),%xmm2 837dcf: c5 eb 5c 5f 10 vsubsd 0x10(%rdi),%xmm2,%xmm3 837dd4: c4 c1 7b 11 5d 10 vmovsd %xmm3,0x10(%r13) 837dda: 0f 82 65 ff ff ff jb 837d45 <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xb05> 837de0: 44 8b b4 24 f8 00 00 mov 0xf8(%rsp),%r14d 837de7: 00 837de8: 45 33 ed xor %r13d,%r13d 837deb: 48 8d 3d e2 75 37 00 lea 0x3775e2(%rip),%rdi # baf3d4 <.2.22385_2_kmpc_loc_struct_pack.1092> 837df2: 8b b4 24 68 01 00 00 mov 0x168(%rsp),%esi 837df9: e8 92 a7 de ff callq 622590 <__kmpc_for_static_fini@plt>
#pragma omp for schedule(static) 8468db: 41 ff cf dec %r15d 8468de: bb 01 00 00 00 mov $0x1,%ebx 8468e3: 44 89 ac 24 18 01 00 mov %r13d,0x118(%rsp) 8468ea: 00 8468eb: 48 8d 3d a2 ab 37 00 lea 0x37aba2(%rip),%rdi # bc1494 <.2.22438_2_kmpc_loc_struct_pack.1251> 8468f2: 44 89 bc 24 1c 01 00 mov %r15d,0x11c(%rsp) 8468f9: 00 8468fa: ba 22 00 00 00 mov $0x22,%edx 8468ff: 44 89 ac 24 20 01 00 mov %r13d,0x120(%rsp) 846906: 00 846907: 89 9c 24 24 01 00 00 mov %ebx,0x124(%rsp) 84690e: 48 83 c4 e0 add $0xffffffffffffffe0,%rsp 846912: 48 8d 84 24 44 01 00 lea 0x144(%rsp),%rax 846919: 00 84691a: 48 8d 8c 24 40 01 00 lea 0x140(%rsp),%rcx 846921: 00 846922: 4c 8d 84 24 38 01 00 lea 0x138(%rsp),%r8 846929: 00 84692a: 48 89 04 24 mov %rax,(%rsp) 84692e: 4c 8d 8c 24 3c 01 00 lea 0x13c(%rsp),%r9 846935: 00 846936: 89 5c 24 08 mov %ebx,0x8(%rsp) 84693a: 89 5c 24 10 mov %ebx,0x10(%rsp) 84693e: 8b 70 3c mov 0x3c(%rax),%esi 846941: e8 7a a0 da ff callq 5f09c0 <__kmpc_for_static_init_4u@plt> 846946: 48 83 c4 20 add $0x20,%rsp 84694a: 8b 84 24 18 01 00 00 mov 0x118(%rsp),%eax 846951: 8b 94 24 1c 01 00 00 mov 0x11c(%rsp),%edx 846958: 41 3b c7 cmp %r15d,%eax 84695b: 0f 87 e0 00 00 00 ja 846a41 <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbb1> 846961: 41 3b d7 cmp %r15d,%edx 846964: 44 0f 42 fa cmovb %edx,%r15d for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) { 846968: 41 3b c7 cmp %r15d,%eax 84696b: 0f 87 d0 00 00 00 ja 846a41 <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbb1> #pragma omp for schedule(static) 846971: 44 2b f8 sub %eax,%r15d 846974: 44 89 eb mov %r13d,%ebx 846977: 41 ff c7 inc %r15d EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; } 84697a: 44 89 bc 24 e8 00 00 mov %r15d,0xe8(%rsp) 846981: 00 846982: 41 89 c7 mov %eax,%r15d 846985: 44 89 b4 24 f0 00 00 mov %r14d,0xf0(%rsp) 84698c: 00 unsigned int mat_idx = m_mat_indices[i]; 84698d: 4d 8b 04 24 mov (%r12),%r8 for (int i = 0; i < 
m_scenario->get_num_gridpoints(); i++) { 846991: 45 8d 2c 1f lea (%r15,%rbx,1),%r13d 846995: 4d 63 ed movslq %r13d,%r13 SrcEvaluatorType srcEvaluator(src); 846998: 48 8d 7c 24 18 lea 0x18(%rsp),%rdi 84699d: 48 89 7f f0 mov %rdi,-0x10(%rdi) call_dense_assignment_loop(dst, src, func); 8469a1: 48 8d 74 24 68 lea 0x68(%rsp),%rsi unsigned int mat_idx = m_mat_indices[i]; 8469a6: 4d 8b b0 b8 00 00 00 mov 0xb8(%r8),%r14 { return *(this->_M_impl._M_start + __n); } 8469ad: 4d 8b 98 d8 00 00 00 mov 0xd8(%r8),%r11 (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]); 8469b4: 4d 8b 48 60 mov 0x60(%r8),%r9 8469b8: 4f 8d 54 6d 00 lea 0x0(%r13,%r13,2),%r10 unsigned int mat_idx = m_mat_indices[i]; 8469bd: 43 8b 0c ae mov (%r14,%r13,4),%ecx 8469c1: 48 69 d1 38 02 00 00 imul $0x238,%rcx,%rdx } // static inline void static void update_density(const sim_constants &sc, density& d, real e, real *p_t) { d = sc.A1 * (d + sc.d_in) - sc.d_in; 8469c8: 4d 8d b4 13 08 02 00 lea 0x208(%r11,%rdx,1),%r14 8469cf: 00 call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>()); 8469d0: 4c 89 76 10 mov %r14,0x10(%rsi) 8469d4: 49 8d 94 13 90 01 00 lea 0x190(%r11,%rdx,1),%rdx 8469db: 00 8469dc: 48 89 16 mov %rdx,(%rsi) 8469df: 48 8d 94 24 68 01 00 lea 0x168(%rsp),%rdx 8469e6: 00 (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]); 8469e7: 4f 8d 2c d1 lea (%r9,%r10,8),%r13 8469eb: 4c 89 6e 08 mov %r13,0x8(%rsi) 8469ef: e8 6c cc da ff callq 5f3660 <void Eigen::internal::call_dense_assignment_loop<Eigen::Matrix<double, 3, 1, 0, 3, 1>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 1>, Eigen::internal::assign_op<double, double> >(Eigen::Matrix<double, 3, 1, 0, 3, 1>&, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, 
Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 1> const&, Eigen::internal::assign_op<double, double> const&)@plt> Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); 8469f4: 48 8b 54 24 08 mov 0x8(%rsp),%rdx #pragma omp for schedule(static) 8469f9: ff c3 inc %ebx SrcEvaluatorType srcEvaluator(src); 8469fb: 4c 89 74 24 30 mov %r14,0x30(%rsp) return _mm_loadu_pd(from); 846a00: c5 f9 10 02 vmovupd (%rdx),%xmm0 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } 846a04: c4 c1 79 5c 0e vsubpd (%r14),%xmm0,%xmm1 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } 846a09: c4 c1 79 11 4d 00 vmovupd %xmm1,0x0(%r13) Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); 846a0f: 48 8b 4c 24 08 mov 0x8(%rsp),%rcx 846a14: 48 8b 74 24 30 mov 0x30(%rsp),%rsi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; } 846a19: c5 fb 10 51 10 vmovsd 0x10(%rcx),%xmm2 846a1e: c5 eb 5c 5e 10 vsubsd 0x10(%rsi),%xmm2,%xmm3 846a23: c4 c1 7b 11 5d 10 vmovsd %xmm3,0x10(%r13) 846a29: 3b 9c 24 e8 00 00 00 cmp 0xe8(%rsp),%ebx 846a30: 0f 82 57 ff ff ff jb 84698d <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xafd> 846a36: 44 8b b4 24 f0 00 00 mov 0xf0(%rsp),%r14d 846a3d: 00 846a3e: 45 33 ed xor %r13d,%r13d 846a41: 48 8d 3d 4c aa 37 00 lea 0x37aa4c(%rip),%rdi # bc1494 <.2.22438_2_kmpc_loc_struct_pack.1251> 846a48: 8b b4 24 60 01 00 00 mov 0x160(%rsp),%esi 846a4f: e8 9c 24 de ff callq 628ef0 <__kmpc_for_static_fini@plt>
Mail converted by MHonArc 2.6.19+ | http://listengine.tuxfamily.org/ |