Valgrind не работает на Arm64 и Intrinsics - PullRequest
0 голосов
/ 25 января 2019

Я написал функцию умножения матрицы на векторы float16 (переданные как uint16) с использованием встроенных функций (intrinsics) Arm NEON. При обычном запуске программа работает нормально, но при запуске под valgrind / callgrind происходит сбой. Вот функция с тестовой программой:

#include <stdlib.h>
#include <math.h>
#include <arm_neon.h>
#include <stdio.h>
#include <vector>
#include <glm/vec3.hpp> // glm::vec3
#include <glm/vec4.hpp> // glm::vec4, glm::ivec4
#include <glm/mat4x4.hpp> // glm::mat4
#include <glm/gtc/matrix_transform.hpp> // glm::translate, glm::rotate, glm::scale, glm::perspective
#include <glm/gtc/type_ptr.hpp> // glm::value_ptr
#include <glm/gtc/packing.hpp>

/**
 * Transforms a batch of half-precision 4-vectors by a 4x4 float matrix
 * using Arm NEON intrinsics.
 *
 * Every input element carries four half-precision values stored as raw
 * 16-bit patterns. Each one is widened to float32 and multiplied as
 *
 *     result = matrix[0]*v.x + matrix[1]*v.y + matrix[2]*v.z + matrix[3]*v.w
 *
 * i.e. matrix[c] is treated as column c (GLM's column-major convention).
 *
 * @param matrix  4x4 transformation matrix.
 * @param input   Vectors to transform; each component is a float16 bit pattern.
 * @param output  Receives one glm::vec4 per input element, at the same index.
 *                It is grown to input.size() when it is smaller; elements
 *                past input.size() are left untouched.
 */
void __attribute__ ((noinline))
transformVectorU16 ( glm::mat4 const & matrix,
                  std::vector < glm::u16vec4 > const & input,
                  std::vector < glm::vec4 >          & output )
{
   // Guard against writing past the end of a too-small destination.
   if ( output.size() < input.size() )
      output.resize ( input.size() );

   // Load the columns with explicit loads instead of reinterpreting the
   // glm::mat4 object as a NEON aggregate (avoids aliasing/alignment traps).
   const float32x4_t col0 = vld1q_f32 ( &matrix[0][0] );
   const float32x4_t col1 = vld1q_f32 ( &matrix[1][0] );
   const float32x4_t col2 = vld1q_f32 ( &matrix[2][0] );
   const float32x4_t col3 = vld1q_f32 ( &matrix[3][0] );

   std::vector < glm::vec4 >::iterator dst = output.begin();
   for ( std::vector < glm::u16vec4 >::const_iterator src = input.begin();
         src != input.end();
         ++src, ++dst )
   {
      // Load the four 16-bit patterns as integers, then relabel them as
      // half floats; this keeps the bits unchanged without a pointer cast.
      const float16x4_t halves = vreinterpret_f16_u16 ( vld1_u16 ( &(*src)[0] ) );
      const float32x4_t vec    = vcvt_f32_f16 ( halves );

      const float32x2_t xy = vget_low_f32  ( vec );
      const float32x2_t zw = vget_high_f32 ( vec );

      // Broadcast one vector component per column: a plain elementwise
      // multiply would scale row r by component r, which is not M * v.
      float32x4_t rslt = vmulq_lane_f32 (       col0, xy, 0 );
      rslt             = vmlaq_lane_f32 ( rslt, col1, xy, 1 );
      rslt             = vmlaq_lane_f32 ( rslt, col2, zw, 0 );
      rslt             = vmlaq_lane_f32 ( rslt, col3, zw, 1 );

      vst1q_f32 ( &(*dst)[0], rslt );
   }
}


int main(int argc, char* argv[])
{ 
   glm::mat4 matrix( 1,0,0,13,
                     2,0,0,14,
                     3,0,0,15,
                     4,0,0,16 );

   union Convert
   {
      glm::uint16 val; 
      __fp16 t;
   } u[4];

   std::vector < glm::u16vec4 > c;
   std::vector < glm::vec4 > b;
   size_t num = rand() % 10000;
   for ( size_t i = 0; i < num; ++i )
   {
      u[0].t = float(c.size());
      u[1].t = float(c.size());
      u[2].t = float(c.size());
      u[3].t = float(c.size());
      c.push_back ( glm::u16vec4 ( u[0].val,
                                   u[1].val,
                                   u[2].val,
                                   u[3].val ) );
   }

   b.resize ( c.size() );
   transformVectorU16 ( matrix, c , b );
   for ( auto & bentry : b )
      printf("%f %f %f %f\n", bentry[0], bentry[1], bentry[2], bentry[3] );

   return 0;
} 

Это отчет о сбое:

==433== Memcheck, a memory error detector
==433== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==433== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==433== Command: ./vectortest
==433== 
t3 = GET:F16(338)
vex: the `impossible' happened:
   iselStmt
vex storage: T total 61973608 bytes allocated
vex storage: P total 0 bytes allocated

valgrind: the 'impossible' happened:
   LibVEX called failure_exit().

host stacktrace:
==433==    at 0x5803CF0C: show_sched_status_wrk (m_libcassert.c:355)
==433==    by 0x5803D043: report_and_quit (m_libcassert.c:426)
==433==    by 0x5803D277: panic (m_libcassert.c:502)
==433==    by 0x5803D277: vgPlain_core_panic_at (m_libcassert.c:507)
==433==    by 0x5803D297: vgPlain_core_panic (m_libcassert.c:512)
==433==    by 0x5805A907: failure_exit (m_translate.c:740)
==433==    by 0x5810BF33: vpanic (main_util.c:231)
==433==    by 0x58164E1F: iselStmt (host_arm64_isel.c:4003)
==433==    by 0x58164E1F: iselSB_ARM64 (host_arm64_isel.c:4201)
==433==    by 0x58109AFB: libvex_BackEnd (main_main.c:1047)
==433==    by 0x58109AFB: LibVEX_Translate (main_main.c:1174)
==433==    by 0x5805CFCB: vgPlain_translate (m_translate.c:1794)
==433==    by 0x58093DD7: handle_chain_me (scheduler.c:1084)
==433==    by 0x58095A2F: vgPlain_scheduler (scheduler.c:1428)
==433==    by 0x580A69A3: thread_wrapper (syswrap-linux.c:103)
==433==    by 0x580A69A3: run_a_thread_NORETURN (syswrap-linux.c:156)
==433==    by 0xFFFFFFFFFFFFFFFF: ???

sched status:
  running_tid=1

Thread 1: status = VgTs_Runnable (lwpid 433)
==433==    at 0x400C1C: vcvt_f32_f16 (arm_neon.h:14818)
==433==    by 0x400C1C: transformVectorU16(glm::tmat4x4<float, (glm::precision)0> const&, std::vector<glm::tvec4<unsigned short, (glm::precision)0>, std::allocator<glm::tvec4<unsigned short, (glm::precision)0> > > const&, std::vector<glm::tvec4<float, (glm::precision)0>, std::allocator<glm::tvec4<float, (glm::precision)0> > >&) (vectortestcleaned.cpp:28)
==433==    by 0x400907: main (vectortestcleaned.cpp:68)


Note: see also the FAQ in the source distribution.
It contains workarounds to several common problems.
In particular, if Valgrind aborted or crashed after
identifying problems in your program, there's a good chance
that fixing those problems will prevent Valgrind aborting or
crashing, especially if it happened in m_mallocfree.c.

If that doesn't help, please report this bug to: www.valgrind.org

In the bug report, send all the above text, the valgrind
version, and what OS and version you are using.  Thanks.

Это сообщение об ошибке мне непонятно. Не могли бы вы подсказать, в чём причина сбоя?

Привет

...