Question

性能分析器（Profiler）显示，这个函数占用了总运行时间的 50%。你会如何优化它？它将 BMP 的颜色格式转换为 YUV。谢谢！

更新：平台是 ARMv6（为 iPhone 开发）

/*
 * 8-bit fixed-point colour conversion helpers using the common BT.601-style
 * integer weights (luma 66/129/25, chroma 112/94/18 and 38/74/112).
 *
 * Argument order is (blue, green, red) despite the "_RGB" suffix: the THIRD
 * argument is the one multiplied by the red weights (66, 112, -38). The
 * positional meaning is unchanged from earlier revisions; only the parameter
 * names were corrected to say what each position really is.
 *
 * Y_FROM_RGB expects the channels of a single pixel (0..255 each).
 *
 * V_FROM_RGB / U_FROM_RGB shift by 10 instead of 8, i.e. they divide by an
 * extra factor of four, so they are meant to be fed the per-channel SUMS of
 * four pixels (0..1020 each). The +128 chroma offset is folded into the
 * numerator as (128 << 10): for inputs in that range the value being shifted
 * is then never negative, which avoids the implementation-defined right
 * shift of a negative int while producing the same result as
 * "(x >> 10) + 128" does with an arithmetic shift.
 *
 * NOTE(review): the rounding term is 128 although the divisor is 1024, so
 * the chroma average is not rounded to nearest (512 would do that). It is
 * kept as-is to preserve the existing output values - confirm before changing.
 *
 * Every argument and the whole expansion are parenthesised so the macros can
 * be used with arbitrary expressions and inside larger expressions.
 */
#define Y_FROM_RGB(b_, g_, r_) \
  ((((66 * (r_)) + (129 * (g_)) + (25 * (b_)) + 128) >> 8) + 16)
#define V_FROM_RGB(b_, g_, r_) \
  (((112 * (r_)) - (94 * (g_)) - (18 * (b_)) + 128 + (128 << 10)) >> 10)
#define U_FROM_RGB(b_, g_, r_) \
  (((-38 * (r_)) - (74 * (g_)) + (112 * (b_)) + 128 + (128 << 10)) >> 10)

/*!
 * \brief
 * Converts a packed 24 bit image to planar Y, U and V channels with one
 * chroma sample per 2x2 pixel block (YUV 4:2:0 planar layout).
 *
 * \param source
 * Source 24bit image pointer. Pixels are 3 bytes each and rows are read
 * source_width * 3 bytes apart (no row padding is accounted for). Byte 0 of
 * a pixel is given the red weights, byte 1 green, byte 2 blue.
 * NOTE(review): BMP files normally store pixels as B,G,R - confirm the
 * caller's byte order.
 *
 * \param source_width
 * Source image width in pixels
 *
 * \param dest_Y
 * destination image Y component pointer
 *
 * \param dest_scan_size_Y
 * destination image Y component line size (in bytes)
 *
 * \param dest_U
 * destination image U component pointer
 *
 * \param dest_scan_size_U
 * destination image U component line size (in bytes)
 *
 * \param dest_V
 * destination image V component pointer
 *
 * \param dest_scan_size_V
 * destination image V component line size (in bytes)
 *
 * \param dest_width
 * Destination image width = source_width
 *
 * \param dest_height
 * Destination image height = source image height
 *
 * Rows are processed in pairs. For the upper row of each pair the luma of
 * every pixel is written and, for each horizontal pixel pair, the channel
 * sums of the 2x2 block (upper and lower row) are turned into one U and one
 * V sample. The lower row then only produces luma.
 *
 * Only complete 2x2 blocks are converted: with an odd dest_width the last
 * column, and with an odd dest_height the last row, is not written to any
 * plane. Non-positive sizes result in no work.
 */
void ImageConvert_24_YUV420P(unsigned char * source, int source_width,
                            unsigned char * dest_Y, int dest_scan_size_Y,
                            unsigned char * dest_U, int dest_scan_size_U,
                            unsigned char * dest_V, int dest_scan_size_V,
                            int dest_width, int dest_height)
{
  const int source_scan_size = source_width * 3;
  const int half_width = dest_width / 2;
  const int half_height = dest_height / 2;

  for (int y = 0; y < half_height; y++)
  {
    /* Upper row of the pair: luma for both pixels plus shared chroma. */
    const unsigned char *upper = source;
    const unsigned char *lower = source + source_scan_size;
    unsigned char *out_Y = dest_Y;
    unsigned char *out_U = dest_U;
    unsigned char *out_V = dest_V;

    for (int x = 0; x < half_width; x++)
    {
      /* Left pixel of the block. */
      int R = upper[0];
      int G = upper[1];
      int B = upper[2];
      out_Y[0] = (unsigned char)Y_FROM_RGB(B, G, R);

      /* Right pixel of the block. */
      const int R1 = upper[3];
      const int G1 = upper[4];
      const int B1 = upper[5];
      const int luma1 = Y_FROM_RGB(B1, G1, R1);

      /* Accumulate all four pixels of the 2x2 block; the chroma macros
         divide by the extra factor of four themselves. */
      R += R1 + lower[0] + lower[3];
      G += G1 + lower[1] + lower[4];
      B += B1 + lower[2] + lower[5];

      out_Y[1] = (unsigned char)luma1;
      *out_V = (unsigned char)V_FROM_RGB(B, G, R);
      *out_U = (unsigned char)U_FROM_RGB(B, G, R);

      upper += 6;
      lower += 6;
      out_Y += 2;
      out_U += 1;
      out_V += 1;
    }

    /* Advance to the lower row; chroma planes move once per row pair. */
    source += source_scan_size;
    dest_Y += dest_scan_size_Y;
    dest_U += dest_scan_size_U;
    dest_V += dest_scan_size_V;

    /* Lower row of the pair: luma only, same pixels the chroma loop covered. */
    const unsigned char *in = source;
    out_Y = dest_Y;

    for (int x = 0; x < 2 * half_width; x++)
    {
      *out_Y = (unsigned char)Y_FROM_RGB(in[2], in[1], in[0]);
      in += 3;
      out_Y += 1;
    }

    source += source_scan_size;
    dest_Y += dest_scan_size_Y;
  }
}

Answer 1

除非我遗漏了什么，否则下面这段代码似乎在两个循环中重复出现。那么，为什么不把它们合并，只遍历一次这个循环呢？这可能需要对算法做一些改动，但会提高性能。

for (int x = 0; x < half_width; x ++) 
{ 
  int R = source_scan[0]; 
  int G = source_scan[1]; 
  int B = source_scan[2]; 

  //Y 
  int Y = Y_FROM_RGB(B, G, R); 

  *dest_scan_Y = Y; 
  source_scan += 3; 
  dest_scan_Y += 1; 

  R = source_scan[0]; 
  G = source_scan[1]; 
  B = source_scan[2];

不过，在做任何改动之前，请先把两个内层循环分别移到单独的函数中，然后运行性能分析器，看看其中一个函数花费的时间是否比另一个多。

这个函数中有三个循环，你并不知道时间实际上花在了哪一部分。因此，在进行任何优化之前先确定这一点，否则你可能会发现自己优化的是错误的部分。

Answer 2

我不知道你使用的是哪个平台，但你可以考虑使用 SIMD 指令。

ARM Cortex-A8 具有支持 SIMD 的 NEON 技术。你应该可以在 ARM 官网上找到更多信息。

Answer 3

假设它们指向的内存互不重叠，你应该使用 restrict 限定符来声明 source、dest_Y、dest_U 和 dest_V 指针，以便把这一点告诉编译器，让它能够更好地进行优化。

如何优化此代码？

3 个答案: