Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE

Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE

[ Thread Index | Date Index | More lists.tuxfamily.org/eigen Archives ]

To: eigen@xxxxxxxxxxxxxxxxxxx
Subject: Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE
From: Michael Riesch <michael.riesch@xxxxxx>
Date: Fri, 15 Mar 2019 09:16:07 +0100
Authentication-results: postout.lrz.de (amavisd-new); dkim=pass (2048-bit key) reason="pass (just generated, assumed good)" header.d=tum.de
Dkim-signature: v=1; a=rsa-sha256; c=relaxed/simple; d=tum.de; h= content-type:content-type:in-reply-to:mime-version:user-agent :date:date:message-id:from:from:references:subject:subject :received:received; s=postout; t=1552637767; bh=FY4hiQ3eN/imCNW0 c2wq/q6Kcxw50vg6JGd7s1ITZ7g=; b=oRPW05SJt38qDisAAe96926CZXvDFoaQ RipsTbPvUFrmrA4MzMKhWGIpPJi7BKytydQAlaUvt4nvRpQHkp7lZ36UlAFL8GQn 3g4dA2kfKtIp1czD8531kefBkXZ5IKlZnuRxz/tL5qeN7W22VXnVrlAbpbYIIplL Pc5ah0/ebDH2/6eKTs2pQ63MWVZAgdfvCPPxTFZOYh6XmpiWsnT3OHioqwBa9KVA M4Hv/j2Ys9wg/zwE96Nh7oNwF7ChDRJpZRHDK954bx0yUbtph6gdu/usMctb8vGy zxL796AeRZ6ShHal1+eJm8KNwRrj2Gn5ISdvTu6Zbyij6S5vFj2hkA==

Hello Christoph and Gael,

I tried to extract the relevant assembly and have attached it. The file eigen_forceinline.txt contains the critical part as generated by default; the file eigen_inline.txt contains the same part, but with EIGEN_STRONG_INLINE set to "inline". In both cases, the compiler options are -O3 -g -DNDEBUG -fPIC -xHost -qopenmp -std=gnu++11

Thanks and regards,
Michael

PS.:

> I think "Science" is the best fit, since mbsolve [1] "is an open-source
> solver tool for the Maxwell-Bloch equations, which are used to model
> light-matter interaction in nonlinear optics."

Done!

Thank you :-)

#pragma omp for schedule(static)
  837c9b:	41 ff cf             	dec    %r15d
  837c9e:	bb 01 00 00 00       	mov    $0x1,%ebx
  837ca3:	44 89 ac 24 20 01 00 	mov    %r13d,0x120(%rsp)
  837caa:	00
  837cab:	48 8d 3d 22 77 37 00 	lea    0x377722(%rip),%rdi        # baf3d4 <.2.22385_2_kmpc_loc_struct_pack.1092>
  837cb2:	44 89 bc 24 24 01 00 	mov    %r15d,0x124(%rsp)
  837cb9:	00
  837cba:	ba 22 00 00 00       	mov    $0x22,%edx
  837cbf:	44 89 ac 24 28 01 00 	mov    %r13d,0x128(%rsp)
  837cc6:	00
  837cc7:	89 9c 24 2c 01 00 00 	mov    %ebx,0x12c(%rsp)
  837cce:	48 83 c4 e0          	add    $0xffffffffffffffe0,%rsp
  837cd2:	48 8d 84 24 4c 01 00 	lea    0x14c(%rsp),%rax
  837cd9:	00
  837cda:	48 8d 8c 24 48 01 00 	lea    0x148(%rsp),%rcx
  837ce1:	00
  837ce2:	4c 8d 84 24 40 01 00 	lea    0x140(%rsp),%r8
  837ce9:	00
  837cea:	48 89 04 24          	mov    %rax,(%rsp)
  837cee:	4c 8d 8c 24 44 01 00 	lea    0x144(%rsp),%r9
  837cf5:	00
  837cf6:	89 5c 24 08          	mov    %ebx,0x8(%rsp)
  837cfa:	89 5c 24 10          	mov    %ebx,0x10(%rsp)
  837cfe:	8b 70 3c             	mov    0x3c(%rax),%esi
  837d01:	e8 3a 24 db ff       	callq  5ea140 <__kmpc_for_static_init_4u@plt>
  837d06:	48 83 c4 20          	add    $0x20,%rsp
  837d0a:	8b 84 24 20 01 00 00 	mov    0x120(%rsp),%eax
  837d11:	8b 94 24 24 01 00 00 	mov    0x124(%rsp),%edx
  837d18:	41 3b c7             	cmp    %r15d,%eax
  837d1b:	0f 87 ca 00 00 00    	ja     837deb <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbab>
  837d21:	41 3b d7             	cmp    %r15d,%edx
  837d24:	44 0f 42 fa          	cmovb  %edx,%r15d
            for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
  837d28:	41 3b c7             	cmp    %r15d,%eax
  837d2b:	0f 87 ba 00 00 00    	ja     837deb <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbab>
#pragma omp for schedule(static)
  837d31:	44 2b f8             	sub    %eax,%r15d
  837d34:	44 89 eb             	mov    %r13d,%ebx
  837d37:	44 89 b4 24 f8 00 00 	mov    %r14d,0xf8(%rsp)
  837d3e:	00
  837d3f:	41 ff c7             	inc    %r15d
  837d42:	41 89 c6             	mov    %eax,%r14d
                unsigned int mat_idx = m_mat_indices[i];
  837d45:	4d 8b 04 24          	mov    (%r12),%r8
            for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
  837d49:	45 8d 2c 1e          	lea    (%r14,%rbx,1),%r13d
  837d4d:	4d 63 ed             	movslq %r13d,%r13
  EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
  837d50:	48 8d 3c 24          	lea    (%rsp),%rdi
  837d54:	48 8d 74 24 40       	lea    0x40(%rsp),%rsi
                unsigned int mat_idx = m_mat_indices[i];
  837d59:	49 8b 90 b8 00 00 00 	mov    0xb8(%r8),%rdx
      { return *(this->_M_impl._M_start + __n); }
  837d60:	4d 8b 98 d8 00 00 00 	mov    0xd8(%r8),%r11
                    (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
  837d67:	4d 8b 48 60          	mov    0x60(%r8),%r9
  837d6b:	4f 8d 54 6d 00       	lea    0x0(%r13,%r13,2),%r10
                unsigned int mat_idx = m_mat_indices[i];
  837d70:	42 8b 0c aa          	mov    (%rdx,%r13,4),%ecx
  837d74:	48 69 d1 38 02 00 00 	imul   $0x238,%rcx,%rdx
                    (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
  837d7b:	4f 8d 2c d1          	lea    (%r9,%r10,8),%r13
    }

    //    static inline void
    static void
    update_density(const sim_constants &sc, density& d, real e, real *p_t) {
        d = sc.A1 * (d + sc.d_in) - sc.d_in;
  837d7f:	4c 89 6e 08          	mov    %r13,0x8(%rsi)
  837d83:	49 8d 8c 13 08 02 00 	lea    0x208(%r11,%rdx,1),%rcx
  837d8a:	00
  837d8b:	48 89 4e 10          	mov    %rcx,0x10(%rsi)
  837d8f:	48 89 4e 20          	mov    %rcx,0x20(%rsi)
  837d93:	49 8d 94 13 90 01 00 	lea    0x190(%r11,%rdx,1),%rdx
  837d9a:	00
  837d9b:	48 89 16             	mov    %rdx,(%rsi)
  837d9e:	e8 dd 96 db ff       	callq  5f1480 <Eigen::internal::binary_evaluator<Eigen::CwiseBinaryOp<Eigen::internal::scalar_difference_op<double, double>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 0> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, Eigen::internal::IndexBased, Eigen::internal::IndexBased, double, double>::binary_evaluator(Eigen::CwiseBinaryOp<Eigen::internal::scalar_difference_op<double, double>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 0> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const> const&)@plt>
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
  837da3:	48 8b 54 24 08       	mov    0x8(%rsp),%rdx
#pragma omp for schedule(static)
  837da8:	ff c3                	inc    %ebx
  837daa:	48 8b 4c 24 30       	mov    0x30(%rsp),%rcx
  837daf:	41 3b df             	cmp    %r15d,%ebx
  return _mm_loadu_pd(from);
  837db2:	c5 f9 10 02          	vmovupd (%rdx),%xmm0
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
  837db6:	c5 f9 5c 09          	vsubpd (%rcx),%xmm0,%xmm1
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
  837dba:	c4 c1 79 11 4d 00    	vmovupd %xmm1,0x0(%r13)
  837dc0:	48 8b 74 24 08       	mov    0x8(%rsp),%rsi
  837dc5:	48 8b 7c 24 30       	mov    0x30(%rsp),%rdi
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
  837dca:	c5 fb 10 56 10       	vmovsd 0x10(%rsi),%xmm2
  837dcf:	c5 eb 5c 5f 10       	vsubsd 0x10(%rdi),%xmm2,%xmm3
  837dd4:	c4 c1 7b 11 5d 10    	vmovsd %xmm3,0x10(%r13)
  837dda:	0f 82 65 ff ff ff    	jb     837d45 <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xb05>
  837de0:	44 8b b4 24 f8 00 00 	mov    0xf8(%rsp),%r14d
  837de7:	00
  837de8:	45 33 ed             	xor    %r13d,%r13d
  837deb:	48 8d 3d e2 75 37 00 	lea    0x3775e2(%rip),%rdi        # baf3d4 <.2.22385_2_kmpc_loc_struct_pack.1092>
  837df2:	8b b4 24 68 01 00 00 	mov    0x168(%rsp),%esi
  837df9:	e8 92 a7 de ff       	callq  622590 <__kmpc_for_static_fini@plt>

#pragma omp for schedule(static)
  8468db:	41 ff cf             	dec    %r15d
  8468de:	bb 01 00 00 00       	mov    $0x1,%ebx
  8468e3:	44 89 ac 24 18 01 00 	mov    %r13d,0x118(%rsp)
  8468ea:	00
  8468eb:	48 8d 3d a2 ab 37 00 	lea    0x37aba2(%rip),%rdi        # bc1494 <.2.22438_2_kmpc_loc_struct_pack.1251>
  8468f2:	44 89 bc 24 1c 01 00 	mov    %r15d,0x11c(%rsp)
  8468f9:	00
  8468fa:	ba 22 00 00 00       	mov    $0x22,%edx
  8468ff:	44 89 ac 24 20 01 00 	mov    %r13d,0x120(%rsp)
  846906:	00
  846907:	89 9c 24 24 01 00 00 	mov    %ebx,0x124(%rsp)
  84690e:	48 83 c4 e0          	add    $0xffffffffffffffe0,%rsp
  846912:	48 8d 84 24 44 01 00 	lea    0x144(%rsp),%rax
  846919:	00
  84691a:	48 8d 8c 24 40 01 00 	lea    0x140(%rsp),%rcx
  846921:	00
  846922:	4c 8d 84 24 38 01 00 	lea    0x138(%rsp),%r8
  846929:	00
  84692a:	48 89 04 24          	mov    %rax,(%rsp)
  84692e:	4c 8d 8c 24 3c 01 00 	lea    0x13c(%rsp),%r9
  846935:	00
  846936:	89 5c 24 08          	mov    %ebx,0x8(%rsp)
  84693a:	89 5c 24 10          	mov    %ebx,0x10(%rsp)
  84693e:	8b 70 3c             	mov    0x3c(%rax),%esi
  846941:	e8 7a a0 da ff       	callq  5f09c0 <__kmpc_for_static_init_4u@plt>
  846946:	48 83 c4 20          	add    $0x20,%rsp
  84694a:	8b 84 24 18 01 00 00 	mov    0x118(%rsp),%eax
  846951:	8b 94 24 1c 01 00 00 	mov    0x11c(%rsp),%edx
  846958:	41 3b c7             	cmp    %r15d,%eax
  84695b:	0f 87 e0 00 00 00    	ja     846a41 <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbb1>
  846961:	41 3b d7             	cmp    %r15d,%edx
  846964:	44 0f 42 fa          	cmovb  %edx,%r15d
            for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
  846968:	41 3b c7             	cmp    %r15d,%eax
  84696b:	0f 87 d0 00 00 00    	ja     846a41 <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbb1>
#pragma omp for schedule(static)
  846971:	44 2b f8             	sub    %eax,%r15d
  846974:	44 89 eb             	mov    %r13d,%ebx
  846977:	41 ff c7             	inc    %r15d
    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
  84697a:	44 89 bc 24 e8 00 00 	mov    %r15d,0xe8(%rsp)
  846981:	00
  846982:	41 89 c7             	mov    %eax,%r15d
  846985:	44 89 b4 24 f0 00 00 	mov    %r14d,0xf0(%rsp)
  84698c:	00
                unsigned int mat_idx = m_mat_indices[i];
  84698d:	4d 8b 04 24          	mov    (%r12),%r8
            for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
  846991:	45 8d 2c 1f          	lea    (%r15,%rbx,1),%r13d
  846995:	4d 63 ed             	movslq %r13d,%r13
  SrcEvaluatorType srcEvaluator(src);
  846998:	48 8d 7c 24 18       	lea    0x18(%rsp),%rdi
  84699d:	48 89 7f f0          	mov    %rdi,-0x10(%rdi)
    call_dense_assignment_loop(dst, src, func);
  8469a1:	48 8d 74 24 68       	lea    0x68(%rsp),%rsi
                unsigned int mat_idx = m_mat_indices[i];
  8469a6:	4d 8b b0 b8 00 00 00 	mov    0xb8(%r8),%r14
      { return *(this->_M_impl._M_start + __n); }
  8469ad:	4d 8b 98 d8 00 00 00 	mov    0xd8(%r8),%r11
                    (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
  8469b4:	4d 8b 48 60          	mov    0x60(%r8),%r9
  8469b8:	4f 8d 54 6d 00       	lea    0x0(%r13,%r13,2),%r10
                unsigned int mat_idx = m_mat_indices[i];
  8469bd:	43 8b 0c ae          	mov    (%r14,%r13,4),%ecx
  8469c1:	48 69 d1 38 02 00 00 	imul   $0x238,%rcx,%rdx
    }

    //    static inline void
    static void
    update_density(const sim_constants &sc, density& d, real e, real *p_t) {
        d = sc.A1 * (d + sc.d_in) - sc.d_in;
  8469c8:	4d 8d b4 13 08 02 00 	lea    0x208(%r11,%rdx,1),%r14
  8469cf:	00
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
  8469d0:	4c 89 76 10          	mov    %r14,0x10(%rsi)
  8469d4:	49 8d 94 13 90 01 00 	lea    0x190(%r11,%rdx,1),%rdx
  8469db:	00
  8469dc:	48 89 16             	mov    %rdx,(%rsi)
  8469df:	48 8d 94 24 68 01 00 	lea    0x168(%rsp),%rdx
  8469e6:	00
                    (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
  8469e7:	4f 8d 2c d1          	lea    (%r9,%r10,8),%r13
  8469eb:	4c 89 6e 08          	mov    %r13,0x8(%rsi)
  8469ef:	e8 6c cc da ff       	callq  5f3660 <void Eigen::internal::call_dense_assignment_loop<Eigen::Matrix<double, 3, 1, 0, 3, 1>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 1>, Eigen::internal::assign_op<double, double> >(Eigen::Matrix<double, 3, 1, 0, 3, 1>&, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 1> const&, Eigen::internal::assign_op<double, double> const&)@plt>
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
  8469f4:	48 8b 54 24 08       	mov    0x8(%rsp),%rdx
#pragma omp for schedule(static)
  8469f9:	ff c3                	inc    %ebx
  SrcEvaluatorType srcEvaluator(src);
  8469fb:	4c 89 74 24 30       	mov    %r14,0x30(%rsp)
  return _mm_loadu_pd(from);
  846a00:	c5 f9 10 02          	vmovupd (%rdx),%xmm0
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
  846a04:	c4 c1 79 5c 0e       	vsubpd (%r14),%xmm0,%xmm1
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
  846a09:	c4 c1 79 11 4d 00    	vmovupd %xmm1,0x0(%r13)
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
  846a0f:	48 8b 4c 24 08       	mov    0x8(%rsp),%rcx
  846a14:	48 8b 74 24 30       	mov    0x30(%rsp),%rsi
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; }
  846a19:	c5 fb 10 51 10       	vmovsd 0x10(%rcx),%xmm2
  846a1e:	c5 eb 5c 5e 10       	vsubsd 0x10(%rsi),%xmm2,%xmm3
  846a23:	c4 c1 7b 11 5d 10    	vmovsd %xmm3,0x10(%r13)
  846a29:	3b 9c 24 e8 00 00 00 	cmp    0xe8(%rsp),%ebx
  846a30:	0f 82 57 ff ff ff    	jb     84698d <mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xafd>
  846a36:	44 8b b4 24 f0 00 00 	mov    0xf0(%rsp),%r14d
  846a3d:	00
  846a3e:	45 33 ed             	xor    %r13d,%r13d
  846a41:	48 8d 3d 4c aa 37 00 	lea    0x37aa4c(%rip),%rdi        # bc1494 <.2.22438_2_kmpc_loc_struct_pack.1251>
  846a48:	8b b4 24 60 01 00 00 	mov    0x160(%rsp),%esi
  846a4f:	e8 9c 24 de ff       	callq  628ef0 <__kmpc_for_static_fini@plt>

Follow-Ups:
- Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE
  - From: Gael Guennebaud

References:
- [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE
  - From: Michael Riesch
- Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE
  - From: Christoph Hertzberg
- Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE
  - From: Michael Riesch
- Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE
  - From: Christoph Hertzberg
- Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE
  - From: Gael Guennebaud

Messages sorted by: [ date | thread ]
Prev by Date: Re: [eigen] Call for survey questions
Next by Date: Re: [eigen] Using Eigen in a project built with CMake
Previous by thread: Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE
Next by thread: Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE

Mail converted by MHonArc 2.6.19+

http://listengine.tuxfamily.org/