Question

Моя целевая платформа — процессор ARM Cortex-A8 с NEON. Я оптимизировал свой код с помощью NEON, но при компиляции получаю странную ошибку и не знаю, как её исправить.

Я пытаюсь скомпилировать следующий код (ЧАСТЬ 1), используя Code Sourcery (ЧАСТЬ 2) на моем хосте. И я получаю эту странную ошибку (ЧАСТЬ 3). Я что-то здесь не так делаю? Кто-нибудь еще может скомпилировать это и посмотреть, получат ли они ту же ошибку компиляции?

Странная часть: в коде, если я закомментирую часть кода else if(step_size == 4), ошибка исчезнет. Но, к сожалению, моя оптимизация не завершена, поэтому я должен иметь ее.

Сначала я подумал, что это проблема компилятора Code Sourcery (на моем хосте), поэтому я скомпилировал программу непосредственно на целевом устройстве (на нём работает Ubuntu). Там я использовал gcc и снова получил ту же ошибку, а когда закомментировал часть else if(step_size == 4), ошибка исчезла.

Помощь!

ЧАСТЬ 1

#include<stdio.h>
#include"arm_neon.h"

#define IMAGE_HEIGHT 480
#define IMAGE_WIDTH  640

float32_t integral_image[IMAGE_HEIGHT][IMAGE_WIDTH];

float32x4_t box_area_compute3(int, int , int , int , unsigned int , float);

inline int min(int, int);

/* Minimal driver: calls box_area_compute3 once with step_size 2 and ignores the returned vector. */
int main()
{

 box_area_compute3(1, 1, 4, 4, 2, 0);

 return 0;
}

/*
 * Builds a four-lane result from integral_image by combining the values
 * loaded at the four corners of a box as
 *     top_left + bottom_right - top_right - bottom_left.
 *
 * row, col           - box origin; each is limited to the image size with
 *                      min() and then reduced by one to form r1 / c1.
 * num_rows, num_cols - box extent; added to row / col, limited the same way
 *                      and reduced by one to form r2 / c2.
 * step_size          - 2 loads the corners with vld2q_f32, 4 with vld4q_f32;
 *                      only val[0] of each loaded structure is used.
 * three              - when it compares equal to 3.0 the result is multiplied
 *                      by it.
 *
 * NOTE(review): v128_areas is never assigned when step_size is neither 2 nor
 * 4, so the value returned in that case is indeterminate.
 * NOTE(review): r1 / c1 become -1 when row / col is 0, which indexes before
 * the start of integral_image - presumably callers always pass >= 1; confirm.
 */
float32x4_t box_area_compute3(int row, int col, int num_rows, int num_cols, unsigned int step_size, float three)
{
 /* Passed to min(int, int), so these unsigned values are converted to int. */
 unsigned int height = IMAGE_HEIGHT;
 unsigned int width = IMAGE_WIDTH;

 int temp_row = row + num_rows;
 int temp_col = col + num_cols;

 /* Upper and lower row indices of the box. */
 int r1 = (min(row, height))- 1 ;
 int r2 = (min(temp_row, height)) - 1;

 /* Left and right column indices of the box. */
 int c1 = (min(col, width)) - 1;
 int c2 = (min(temp_col, width)) - 1;

 float32x4_t v128_areas;

 if(step_size == 2)
 {
  /* Two-way structure loads at each corner. */
  float32x4x2_t top_left, top_right, bottom_left, bottom_right;
  top_left    = vld2q_f32((float32_t *)integral_image[r1] + c1);
  top_right   = vld2q_f32((float32_t *)integral_image[r1] + c2);
  bottom_left  = vld2q_f32((float32_t *)integral_image[r2] + c1);
  bottom_right  = vld2q_f32((float32_t *)integral_image[r2] + c2);

  v128_areas = vsubq_f32(vsubq_f32(vaddq_f32(top_left.val[0], bottom_right.val[0]), top_right.val[0]), bottom_left.val[0]);


 }
 else if(step_size == 4)
 {
  /*
   * Four-way structure loads at each corner. This is the branch the
   * question reports as triggering "unable to find a register to spill";
   * commenting it out made the error disappear.
   */
  float32x4x4_t top_left, top_right, bottom_left, bottom_right;
  top_left   = vld4q_f32((float32_t *)integral_image[r1] + c1);
  top_right   = vld4q_f32((float32_t *)integral_image[r1] + c2);
  bottom_left  = vld4q_f32((float32_t *)integral_image[r2] + c1);
  bottom_right  = vld4q_f32((float32_t *)integral_image[r2] + c2);

  v128_areas = vsubq_f32(vsubq_f32(vaddq_f32(top_left.val[0], bottom_right.val[0]), top_right.val[0]), bottom_left.val[0]);

 }

 /* Optional scaling, applied only when the caller passes exactly 3.0. */
 if(three == 3.0)
  v128_areas = vmulq_n_f32(v128_areas, three);

 return v128_areas;

}

/*
 * Returns the smaller of X and Y (Y when the two are equal).
 *
 * Defined without `inline` on purpose: under C99/C11 rules, a function whose
 * file-scope declarations all say `inline` (without `extern`) provides no
 * external definition, so an unoptimised build that does not inline the call
 * fails to link with an undefined reference to min. A plain definition is
 * compatible with the earlier `inline int min(int, int);` prototype and
 * behaves the same under both gnu89 and C99 inline semantics.
 */
int min(int X, int Y)
{
 return (X < Y ? X : Y);
}

ЧАСТЬ 2

arm-none-linux-gnueabi-gcc -O0 -g3 -Wall -c -fmessage-length=0 -fcommon -MMD -MP -MF"main.d" -MT"main.d" -mcpu=cortex-a8 -marm -mfloat-abi=hard -mfpu=neon-vfpv4 -o"main.o" "../main.c"

ЧАСТЬ 3

../main.c: In function 'box_area_compute3':
../main.c:65: error: unable to find a register to spill in class 'GENERAL_REGS'
../main.c:65: error: this is the insn:
(insn 226 225 227 5 c:\program files\codesourcery\sourcery g++\bin\../lib/gcc/arm-none-linux-gnueabi/4.4.1/include/arm_neon.h:9863 (parallel [
           (set (reg:XI 148 [ D.17028 ])
               (unspec:XI [
                       (mem:XI (reg:SI 3 r3 [301]) [0 S64 A64])
                       (reg:XI 148 [ D.17028 ])
                       (unspec:V4SF [
                               (const_int 0 [0x0])
                           ] 191)
                   ] 111))
           (set (reg:SI 3 r3 [301])
               (plus:SI (reg:SI 3 r3 [301])
                   (const_int 32 [0x20])))
       ]) 1605 {neon_vld4qav4sf} (nil))
../main.c:65: confused by earlier errors, bailing out
cs-make: *** [main.o] Error 1

nategoose · Answer 1 · 28 сентября 2010

Я не могу проверить это, потому что у меня нет нужной цепочки инструментов, но ошибку такого типа часто можно обойти, немного переписав код. Как правило, такого происходить не должно, и об этом следует сообщить как об ошибке компилятора, но вы используете специфическую для процессора функциональность, которая, вероятно, хуже протестирована и отлажена, чем остальная часть компилятора.

Поскольку это ошибка вытеснения регистров (register spill), а у вас есть несколько указателей, я сильно подозреваю, что компилятор пытается загрузить в регистры больше данных, чем нужно, опасаясь возможного алиасинга (которого на самом деле, вероятно, нет). Ниже я учитываю эту возможность, а также делаю ещё несколько вещей, которые могут уменьшить сложность кода с точки зрения компилятора (хотя на вид это может быть и не так).

#include<stdio.h>
#include"arm_neon.h"

#define IMAGE_HEIGHT 480
#define IMAGE_WIDTH  640

float32_t integral_image[IMAGE_HEIGHT][IMAGE_WIDTH];

float32x4_t box_area_compute3(int, int , int , int , unsigned int , float);

inline int min(int, int);

/* Driver: runs box_area_compute3 once on the step_size == 2 path; the result is not used. */
int main()
{
    (void)box_area_compute3(1, 1, 4, 4, 2u, 0.0f);
    return 0;
}

/* By putting these in separate functions the compiler will initially
 * think about them by themselves, without the complications of the
 * surrounding code.  This may give it the ability to optimise the
 * code somewhat before trying to inline it.
 * This may also serve to make it more obvious to the compiler that
 * the local variables are dead after their use (since they are
 * dead after the call returns), and that the lifetimes of some variables
 * cannot actually overlap (hopefully reducing the register needs).
 */
/*
 * Box sum for the step_size == 2 case.
 *
 * Performs a two-way structure load (vld2q_f32) at each of the four corner
 * pointers and combines the first vector (val[0]) of each load as
 *     top_left + bottom_right - top_right - bottom_left.
 *
 * tl, tr, bl, br - pointers to the top-left, top-right, bottom-left and
 *                  bottom-right corner data; each must be readable for the
 *                  full width of a vld2q_f32 load.
 * Returns the four-lane result.
 */
static inline float32x4_t do_it2(float32_t *tl, float32_t *tr, float32_t *bl, float32_t * br) {
    float32x4x2_t top_left, top_right, bottom_left, bottom_right;
    float32x4_t A, B;

    top_left = vld2q_f32(tl);
    top_right = vld2q_f32(tr);
    bottom_left = vld2q_f32(bl);
    bottom_right = vld2q_f32(br);

    /* The arithmetic is spread across several statements on purpose, in the
     * hope that the compiler keeps fewer intermediate values live at once.
     * Every operand is val[0] of its structure; the loaded structures
     * themselves cannot be dereferenced. */
    A = vaddq_f32(top_left.val[0], bottom_right.val[0]);
    B = vsubq_f32(A, top_right.val[0]);
    return vsubq_f32(B, bottom_left.val[0]);
}

/*
 * Box sum for the step_size == 4 case.
 *
 * Performs a four-way structure load (vld4q_f32) at each of the four corner
 * pointers and combines the first vector (val[0]) of each load as
 *     top_left + bottom_right - top_right - bottom_left.
 *
 * tl, tr, bl, br - pointers to the top-left, top-right, bottom-left and
 *                  bottom-right corner data; each must be readable for the
 *                  full width of a vld4q_f32 load.
 * Returns the four-lane result.
 */
static inline float32x4_t do_it4(float32_t *tl, float32_t *tr, float32_t *bl, float32_t * br) {
    float32x4x4_t top_left, top_right, bottom_left, bottom_right;
    float32x4_t A, B;

    top_left = vld4q_f32(tl);
    top_right = vld4q_f32(tr);
    bottom_left = vld4q_f32(bl);
    bottom_right = vld4q_f32(br);

    /* Every operand is val[0] of its structure; the loaded structures
     * themselves cannot be dereferenced. */
    A = vaddq_f32(top_left.val[0], bottom_right.val[0]);
    B = vsubq_f32(A, top_right.val[0]);
    return vsubq_f32(B, bottom_left.val[0]);
}

/*
 * Builds a four-lane result from integral_image by combining the values
 * loaded at the four corners of a box as
 *     top_left + bottom_right - top_right - bottom_left.
 *
 * row, col           - box origin; each is limited to the image size with
 *                      min() and then reduced by one to form r1 / c1.
 * num_rows, num_cols - box extent; added to row / col, limited the same way
 *                      and reduced by one to form r2 / c2.
 * step_size          - 2 dispatches to do_it2, 4 to do_it4; any other value
 *                      produces an all-zero vector instead of an
 *                      indeterminate one.
 * three              - when it compares equal to 3.0 the result is multiplied
 *                      by it.
 *
 * NOTE(review): r1 / c1 become -1 when row / col is 0, which indexes before
 * the start of integral_image - presumably callers always pass >= 1; confirm.
 */
float32x4_t box_area_compute3(int row, int col, int num_rows, int num_cols, unsigned int step_size, float three)
{
    /* Kept as int so the calls to min(int, int) involve no implicit
     * unsigned-to-signed conversion. */
    const int height = IMAGE_HEIGHT;
    const int width = IMAGE_WIDTH;

    const int temp_row = row + num_rows;
    const int temp_col = col + num_cols;

    const int r1 = min(row, height) - 1;
    const int r2 = min(temp_row, height) - 1;

    const int c1 = min(col, width) - 1;
    const int c2 = min(temp_col, width) - 1;

    float32_t *tl = &integral_image[r1][c1];
    float32_t *tr = &integral_image[r1][c2];
    float32_t *bl = &integral_image[r2][c1];
    float32_t *br = &integral_image[r2][c2];

    float32x4_t v128_areas;

    switch (step_size) {
    case 2:
        v128_areas = do_it2(tl, tr, bl, br);
        break;

    case 4:
        v128_areas = do_it4(tl, tr, bl, br);
        break;

    default:
        /* Unsupported step size: return a defined value rather than
         * reading an uninitialised vector. */
        v128_areas = vdupq_n_f32(0.0f);
        break;
    }

    /* Optional scaling, applied only when the caller passes exactly 3.0. */
    if (three == 3.0f) {
        v128_areas = vmulq_n_f32(v128_areas, three);
    }

    return v128_areas;
}

/*
 * Returns the smaller of X and Y (Y when the two are equal).
 *
 * Defined without `inline` on purpose: under C99/C11 rules, a function whose
 * file-scope declarations all say `inline` (without `extern`) provides no
 * external definition, so an unoptimised build that does not inline the call
 * fails to link with an undefined reference to min. A plain definition is
 * compatible with the earlier `inline int min(int, int);` prototype and
 * behaves the same under both gnu89 and C99 inline semantics.
 */
int min(int X, int Y)
{
 return (X < Y ? X : Y);
}

Я надеюсь, что это поможет, и что я не внес никаких ошибок.

HaggarTheHorrible · Answer 2 · 28 октября 2010

Ну, я связался с Code Sourcery по поводу этой проблемы, и они признали это ошибкой в компиляторе GCC. Поэтому я написал функцию do_it4() {.....} на ассемблере вместо использования встроенных функций (intrinsics). Теперь всё работает хорошо!

RTFM · Answer 3 · 20 апреля 2012

Cortex-A8 с NEON поддерживает vfpv3, а Cortex-A5 поддерживает vfpv4 и neon2 (например: если вы используете -mfloat-abi=hard, вы теряете возможность программно эмулировать отсутствующие инструкции, поэтому вы не можете сгенерировать код, который будет оптимизирован для vfpv4, но будет работать на vfpv3 с программной эмуляцией)

Tyson Jacobs · Answer 4 · 23 марта 2011

Строка:

float32x4x4_t top_left, top_right, bottom_left, bottom_right;

использует все 16 регистров q! Неудивительно, что компилятор не может с этим справиться. Возможно, вы могли бы исправить это, переписав код так, чтобы использовать меньше регистров.

Неизвестная ошибка GCC при компиляции для ARM NEON (критическая)

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 4 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Неизвестная ошибка GCC при компиляции для ARM NEON (критическая)

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 4 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Похожие темы