[边缘检测算法] Sobel算子及其PC端优化提速20几倍
文章目录:前言 · 一、Sobel算子 · 二、实现与优化(1.原始版本 2.第一版优化 3.第二版优化 4.第三版优化 5.第四版优化 6.第五版优化)
前言
Sobel是图像处理中非常常见的一种边缘检测算法,本文基于C++和OpenCV对Sobel进行了实现,并对代码进行优化。
一、Sobel算子
Sobel算子是图像处理中的一个离散微分算子,经典的Sobel算子包含了一个3×3的矩阵(OpenCV中的Sobel是包含其它尺寸的,本文只实现3×3及其优化),其结构如下图所示:
如上图所示,一个是X轴方向的算子,一个是Y轴方向的算子。实际上,这两个算子都可以拆分为两个向量的卷积,这两个向量分别是Sobel平滑算子与Sobel差分算子,也就是说X轴方向的Sobel算子是可以拆分为:
而Y轴方向算子可以拆分为:
其中的[1 2 1]是3阶的非归一化的Sobel平滑算子,而[1 0 -1]是一个差分算子。
二、实现与优化
以下内容为对Sobel算子的实现和优化过程,测试图像是一张10000*7002的工业图像。
1.原始版本
最初版本,我对Sobel的实现代码如下:
// Rounded square-root lookup table: Table[m] == (uchar)(sqrt(m) + 0.5)
// for m in [0, 65025]; 65025 = 255*255, so every entry fits in a uchar.
unsigned char Table[65026];
for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f);
// Naive 3x3 Sobel over an interleaved nChan-channel image. The source row
// stride is (nCols + 2)*nChan, i.e. the source is assumed padded by one
// pixel on each side -- TODO confirm the caller provides that border.
// pDstData is the unpadded nRows x nCols result.
for (int i = 0; i < nRows; i++)
{
for (int j = 0; j < nCols; j++)
{
for (int k = 0; k < nChan; k++)
{
// Byte offsets of the 3x3 neighbourhood in the padded source
// (n1..n3 = top row, n4/n6 = middle-row sides, n7..n9 = bottom row)
// and of the output pixel (n5Pos, unpadded destination layout).
int n1Pos = (i*(nCols + 2) + j)*nChan + k;
int n2Pos = (i*(nCols + 2) + j + 1)*nChan + k;
int n3Pos = (i*(nCols + 2) + j + 2)*nChan + k;
int n4Pos = ((i + 1)*(nCols + 2) + j)*nChan + k;
int n5Pos = (i*nCols + j)*nChan + k;
int n6Pos = ((i + 1)*(nCols + 2) + j + 2)*nChan + k;
int n7Pos = ((i + 2)*(nCols + 2) + j)*nChan + k;
int n8Pos = ((i + 2)*(nCols + 2) + j + 1)*nChan + k;
int n9Pos = ((i + 2)*(nCols + 2) + j + 2)*nChan + k;
// Horizontal gradient |Gx|: kernel [-1 0 1; -2 0 2; -1 0 1].
int nX = abs((pSrcData[n3Pos] - pSrcData[n1Pos]
+ 2 * (pSrcData[n6Pos] - pSrcData[n4Pos])
+ pSrcData[n9Pos] - pSrcData[n7Pos]));
// Vertical gradient |Gy|: kernel [-1 -2 -1; 0 0 0; 1 2 1].
int nY = abs((pSrcData[n7Pos] - pSrcData[n1Pos]
+ 2 * (pSrcData[n8Pos] - pSrcData[n2Pos])
+ pSrcData[n9Pos] - pSrcData[n3Pos]));
// Magnitude sqrt(Gx^2 + Gy^2), clamped to 255 via the table.
pDstData[n5Pos] = Table[min(nX * nX + nY * nY, 65025)];
}
}
}
测试时间如下:
2.第一版优化
可以看到初步版本有很多中间计算,如n1Pos -n9Pos ,很多计算都是重复的,且与内部循环没有联系,是可以移到for循环外面的,因此有了第一版优化,代码:
// First optimisation: hoist the per-pixel index arithmetic of the naive
// version out of the inner loops. pSrcData1/4/7 point at the current pixel
// in the three padded source rows the 3x3 kernel reads.
uchar* pSrcData1 = NULL;
uchar* pSrcData4 = NULL;
uchar* pSrcData7 = NULL;
// Rounded square-root lookup table: Table[m] == (uchar)(sqrt(m) + 0.5),
// m in [0, 65025] (65025 = 255*255).
unsigned char Table[65026];
for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f);
for (int i = 0; i < nRows; i++)
{
// Pixel offsets (not yet byte offsets) of the three source rows; the
// padded source row stride is nCols + 2 pixels.
int nLine1 = i*(nCols + 2);
int nLine4 = nLine1 + (nCols + 2);
int nLine7 = nLine4 + (nCols + 2);
for (int j = 0; j < nCols; j++)
{
pSrcData1 = pSrcData + (nLine1 + j)*nChan;
pSrcData4 = pSrcData + (nLine4 + j)*nChan;
pSrcData7 = pSrcData + (nLine7 + j)*nChan;
// Output offset in the unpadded destination.
int n5Pos = (i*nCols + j)*nChan;
for (int k = 0; k < nChan; k++)
{
// |Gx|: right minus left columns; << 1 replaces the * 2 weight.
int nX = abs(*(pSrcData1 + 2 * nChan + k) - *(pSrcData1 + k)
+ ((*(pSrcData4 + 2 * nChan + k) - *(pSrcData4 + k)) << 1)
+ *(pSrcData7 + 2 * nChan + k) - *(pSrcData7 + k));
// |Gy|: bottom minus top rows.
int nY = abs(*(pSrcData7 + k) - *(pSrcData1 + k)
+ ((*(pSrcData7 + nChan + k) - *(pSrcData1 + nChan + k)) << 1)
+ *(pSrcData7 + 2 * nChan + k) - *(pSrcData1 + 2 * nChan + k));
// Magnitude sqrt(Gx^2 + Gy^2) clamped to 255 via the table.
pDstData[n5Pos + k] = Table[min(nX * nX + nY * nY, 65025)];
}
}
}
优化之后时间提升了170ms左右:
3.第二版优化
可以看到里面还是有冗余计算,就是Sobel算子的矩阵第1、2、3层之间的间隔关系是只跟图像的step有关系,所以不管i、j递进到第几行第几列,这个关系是不变的,那就没必要根据i、j来计算下一个计算步骤的起始点,而是直接加上间隔关系即可,并且也没必要计算第2、3行的起始点,只要计算第一行的起始点就可以,然后根据step推算出第2、3行的起始点即可。所以又进一步优化
// Second optimisation: walk each row with fixed byte strides instead of
// recomputing offsets from (i, j); the column and channel loops collapse
// into a single loop over the nCols*nChan bytes of the output row.
int nChan2 = nChan * 2;            // byte offset of the "+2 columns" neighbour
int nSrcStep = (nCols + 2)*nChan;  // padded source row stride in bytes
int nDstStep = nCols*nChan;        // destination row stride in bytes
// Rounded square-root lookup table: Table[m] == (uchar)(sqrt(m) + 0.5),
// m in [0, 65025] (65025 = 255*255).
unsigned char Table[65026];
for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f);
// The three padded source rows read by the kernel, and the output row.
uchar* pSrcData1 = pSrcData;
uchar* pSrcData4 = pSrcData1 + nSrcStep;
uchar* pSrcData7 = pSrcData4 + nSrcStep;
uchar* pDstData4 = pDstData;
for (int i = 0; i < nRows; i++)
{
for (int j = 0; j < nCols*nChan; j++)
{
// Gx and Gy; abs() is no longer needed because only the squares are
// used below.
int nX = (pSrcData1[j + nChan2] - pSrcData1[j]
+ ((pSrcData4[j + nChan2] - pSrcData4[j]) << 1)
+ pSrcData7[j + nChan2] - pSrcData7[j]);
int nY = (pSrcData7[j] - pSrcData1[j]
+ ((pSrcData7[j + nChan] - pSrcData1[j + nChan]) << 1)
+ pSrcData7[j + nChan2] - pSrcData1[j + nChan2]);
// Magnitude sqrt(Gx^2 + Gy^2) clamped to 255 via the table.
pDstData4[j] = Table[min(nX * nX + nY * nY, 65025)];
}
// Advance all four row pointers by one row.
pSrcData1 += nSrcStep;
pSrcData4 += nSrcStep;
pSrcData7 += nSrcStep;
pDstData4 += nDstStep;
}
优化后时间加速了接近240ms左右:
4.第三版优化
看到有技术大佬用SSE来加速,我也抄袭了一波,实现代码如下:
// Third optimisation: SIMD. Each iteration produces 8 output bytes;
// 8-bit pixels are widened to 16-bit lanes so the gradients (max |G| = 1020)
// do not overflow. Note the intrinsics used are not plain SSE/SSE2:
// _mm_abs_epi16 is SSSE3, _mm_mullo_epi32 / _mm_packus_epi32 are SSE4.1.
int nChan2 = nChan * 2;            // byte offset of the "+2 columns" neighbour
int nSrcStep = (nCols + 2)*nChan;  // padded source row stride in bytes
int nDstStep = nCols*nChan;        // destination row stride in bytes
int nBlockSize = 8;                // output bytes per SIMD iteration
int nBlock = nDstStep / nBlockSize;// full SIMD iterations per row
// Rounded square-root lookup table for the scalar tail loop only.
unsigned char Table[65026];
for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f);
// The three padded source rows read by the kernel, and the output row.
uchar* pSrcData1 = pSrcData;
uchar* pSrcData4 = pSrcData1 + nSrcStep;
uchar* pSrcData7 = pSrcData4 + nSrcStep;
uchar* pDstData4 = pDstData;
__m128i Zero = _mm_setzero_si128();
for (int i = 0; i < nRows; i++)
{
for (int j = 0; j < nBlock * nBlockSize; j += nBlockSize)
{
// Load 8 bytes of each neighbour column and zero-extend to 16-bit.
__m128i FirstP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData1
+ j)), Zero);
__m128i FirstP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData1
+ j + nChan)), Zero);
__m128i FirstP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData1
+ j + nChan2)), Zero);
__m128i SecondP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)
(pSrcData4 + j)), Zero);
__m128i SecondP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)
(pSrcData4 + j + nChan2)), Zero);
__m128i ThirdP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData7
+ j)), Zero);
__m128i ThirdP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData7
+ j + nChan)), Zero);
__m128i ThirdP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData7
+ j + nChan2)), Zero);
// |Gx| and |Gy| for 8 pixels; the signs inside are flipped relative
// to the scalar version, which _mm_abs_epi16 makes irrelevant.
__m128i GX16 = _mm_abs_epi16(_mm_add_epi16(_mm_add_epi16(_mm_sub_epi16
(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(SecondP0, SecondP2),
1)), _mm_sub_epi16(ThirdP0, ThirdP2)));
__m128i GY16 = _mm_abs_epi16(_mm_sub_epi16(_mm_add_epi16(_mm_add_epi16
(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(FirstP1, ThirdP1),
1)), _mm_add_epi16(ThirdP0, ThirdP2)));
// Widen to 32-bit so Gx^2 + Gy^2 (up to ~2.08M) cannot overflow.
__m128i GX32L = _mm_unpacklo_epi16(GX16, Zero);
__m128i GX32H = _mm_unpackhi_epi16(GX16, Zero);
__m128i GY32L = _mm_unpacklo_epi16(GY16, Zero);
__m128i GY32H = _mm_unpackhi_epi16(GY16, Zero);
// Magnitude sqrt(Gx^2 + Gy^2) computed in float, rounded back to int.
__m128i ResultL = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps
(_mm_add_epi32(_mm_mullo_epi32(GX32L, GX32L), _mm_mullo_epi32(GY32L,
GY32L)))));
__m128i ResultH = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps
(_mm_add_epi32(_mm_mullo_epi32(GX32H, GX32H), _mm_mullo_epi32(GY32H,
GY32H)))));
// Saturate-pack 32 -> 16 -> 8 bits (clamps magnitudes > 255, matching
// the scalar Table[min(..., 65025)] clamp) and store 8 bytes.
_mm_storel_epi64((__m128i *)(pDstData4 + j), _mm_packus_epi16
(_mm_packus_epi32(ResultL, ResultH), Zero));
}
// Scalar tail for the last nDstStep % nBlockSize bytes of the row.
for (int j = nBlock * nBlockSize; j < nDstStep; j++)
{
int nX = (pSrcData1[j + nChan2] - pSrcData1[j]
+ ((pSrcData4[j + nChan2] - pSrcData4[j]) << 1)
+ pSrcData7[j + nChan2] - pSrcData7[j]);
int nY = (pSrcData7[j] - pSrcData1[j]
+ ((pSrcData7[j + nChan] - pSrcData1[j + nChan]) << 1)
+ pSrcData7[j + nChan2] - pSrcData1[j + nChan2]);
pDstData4[j] = Table[min(nX * nX + nY * nY, 65025)];
}
// Advance all four row pointers by one row.
pSrcData1 = pSrcData1 + nSrcStep;
pSrcData4 = pSrcData4 + nSrcStep;
pSrcData7 = pSrcData7 + nSrcStep;
pDstData4 = pDstData4 + nDstStep;
}
这里用的SSE系列指令集并非只有最早的SSE(其中_mm_abs_epi16属于SSSE3,_mm_mullo_epi32和_mm_packus_epi32属于SSE4.1),在内部计算的时候用8个SSE变量记录8个连续的像素值,每个像素值用16位的数据来表达,这里可以使用_mm_loadl_epi64配合_mm_unpacklo_epi8来实现。
最终的运行时间如下,提升了100多ms:
5.第四版优化
上一版本平方求和用的是两次32位乘法加一次加法(_mm_mullo_epi32 + _mm_add_epi32),这一版把Gx、Gy交错排列后改用SSE2的_mm_madd_epi16一条指令完成乘加,代码如下:
// Fourth optimisation: same SIMD layout as the previous version, but the
// square-and-sum step uses _mm_madd_epi16 (SSE2) -- by interleaving Gx/Gy
// into adjacent 16-bit lanes, one madd produces Gx^2 + Gy^2 per 32-bit lane,
// replacing two 32-bit multiplies and an add.
int nChan2 = nChan * 2;            // byte offset of the "+2 columns" neighbour
int nSrcStep = (nCols + 2)*nChan;  // padded source row stride in bytes
int nDstStep = nCols*nChan;        // destination row stride in bytes
int nBlockSize = 8;                // output bytes per SIMD iteration
int nBlock = nDstStep / nBlockSize;// full SIMD iterations per row
// Rounded square-root lookup table for the scalar tail loop only.
unsigned char Table[65026];
for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f);
// The three padded source rows read by the kernel, and the output row.
uchar* pSrcData1 = pSrcData;
uchar* pSrcData4 = pSrcData1 + nSrcStep;
uchar* pSrcData7 = pSrcData4 + nSrcStep;
uchar* pDstData4 = pDstData;
__m128i Zero = _mm_setzero_si128();
for (int i = 0; i < nRows; i++)
{
for (int j = 0; j < nBlock * nBlockSize; j += nBlockSize)
{
// Load 8 bytes of each neighbour column and zero-extend to 16-bit.
__m128i FirstP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData1
+ j)), Zero);
__m128i FirstP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData1
+ j + nChan)), Zero);
__m128i FirstP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData1
+ j + nChan2)), Zero);
__m128i SecondP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)
(pSrcData4 + j)), Zero);
__m128i SecondP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)
(pSrcData4 + j + nChan2)), Zero);
__m128i ThirdP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData7
+ j)), Zero);
__m128i ThirdP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData7
+ j + nChan)), Zero);
__m128i ThirdP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData7
+ j + nChan2)), Zero);
// |Gx| and |Gy| for 8 pixels (signs flipped vs. scalar; abs fixes it).
__m128i GX16 = _mm_abs_epi16(_mm_add_epi16(_mm_add_epi16(_mm_sub_epi16
(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(SecondP0, SecondP2),
1)), _mm_sub_epi16(ThirdP0, ThirdP2)));
__m128i GY16 = _mm_abs_epi16(_mm_sub_epi16(_mm_add_epi16(_mm_add_epi16
(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(FirstP1, ThirdP1),
1)), _mm_add_epi16(ThirdP0, ThirdP2)));
// Interleave Gx/Gy pairs; _mm_madd_epi16 then yields Gx^2 + Gy^2 in
// each 32-bit lane (max ~2.08M, well within int32).
__m128i GXYL = _mm_unpacklo_epi16(GX16, GY16);
__m128i GXYH = _mm_unpackhi_epi16(GX16, GY16);
// Magnitude sqrt(Gx^2 + Gy^2) computed in float, rounded back to int.
__m128i ResultL = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps
(_mm_madd_epi16(GXYL, GXYL))));
__m128i ResultH = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps
(_mm_madd_epi16(GXYH, GXYH))));
// Saturate-pack 32 -> 16 -> 8 bits (clamps to 255) and store 8 bytes.
_mm_storel_epi64((__m128i *)(pDstData4 + j), _mm_packus_epi16
(_mm_packus_epi32(ResultL, ResultH), Zero));
}
// Scalar tail for the last nDstStep % nBlockSize bytes of the row.
for (int j = nBlock * nBlockSize; j < nDstStep; j++)
{
int nX = (pSrcData1[j + nChan2] - pSrcData1[j]
+ ((pSrcData4[j + nChan2] - pSrcData4[j]) << 1)
+ pSrcData7[j + nChan2] - pSrcData7[j]);
int nY = (pSrcData7[j] - pSrcData1[j]
+ ((pSrcData7[j + nChan] - pSrcData1[j + nChan]) << 1)
+ pSrcData7[j + nChan2] - pSrcData1[j + nChan2]);
pDstData4[j] = Table[min(nX * nX + nY * nY, 65025)];
}
// Advance all four row pointers by one row.
pSrcData1 = pSrcData1 + nSrcStep;
pSrcData4 = pSrcData4 + nSrcStep;
pSrcData7 = pSrcData7 + nSrcStep;
pDstData4 = pDstData4 + nDstStep;
}
时间上有了8ms左右的提升:
6.第五版优化
然后我还想再榨干CPU,所以再优化了一版本,利用C++的多线程又实现了一波提升,代码如下:
// Fifth optimisation: the SIMD row filter of the previous version, dispatched
// across worker threads with std::async -- one task per image row, launched
// in waves of nThreadSize rows.
int nChan2 = nChan * 2;             // byte offset of the "+2 columns" neighbour
int nSrcStep = (nCols + 2)*nChan;   // padded source row stride in bytes
int nDstStep = nCols*nChan;         // destination row stride in bytes
int nBlockSize = 8;                 // output bytes per SIMD iteration
int nBlock = nDstStep / nBlockSize; // full SIMD iterations per row
int nThreadSize = 64;               // rows launched per wave of async tasks
int nThreadBlock = nRows / nThreadSize;
// Rounded square-root lookup table: Table[m] == (uchar)(sqrt(m) + 0.5) for
// m in [0, 65025] (65025 = 255*255); used only by the scalar tail loop.
uchar Table[65026];
for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f);
uchar* pSrcData1 = pSrcData;
uchar* pSrcData4 = pSrcData1 + nSrcStep;
uchar* pSrcData7 = pSrcData4 + nSrcStep;
uchar* pDstData4 = pDstData;
// Filters ONE output row. pSrcData1/4/7 are the three padded source rows the
// 3x3 kernel reads; pDstData4 is the destination row. The lambda captures
// nothing, so concurrent calls on disjoint rows share no mutable state.
auto _func = [](uchar* pSrcData1, uchar* pSrcData4, uchar* pSrcData7, uchar*
pDstData4,
int nBlock, int nBlockSize, int nChan, int nChan2, int nCols, __m128i Zero,
uchar* Table)
{
for (int j = 0; j < nBlock * nBlockSize; j += nBlockSize)
{
// Load 8 bytes of each neighbour column and zero-extend to 16-bit.
__m128i FirstP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData1
+ j)), Zero);
__m128i FirstP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData1
+ j + nChan)), Zero);
__m128i FirstP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData1
+ j + nChan2)), Zero);
__m128i SecondP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)
(pSrcData4 + j)), Zero);
__m128i SecondP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)
(pSrcData4 + j + nChan2)), Zero);
__m128i ThirdP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData7
+ j)), Zero);
__m128i ThirdP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData7
+ j + nChan)), Zero);
__m128i ThirdP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(pSrcData7
+ j + nChan2)), Zero);
// |Gx| and |Gy| for 8 pixels (signs flipped vs. scalar; abs fixes it).
__m128i GX16 = _mm_abs_epi16(_mm_add_epi16(_mm_add_epi16(_mm_sub_epi16
(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(SecondP0, SecondP2),
1)), _mm_sub_epi16(ThirdP0, ThirdP2)));
__m128i GY16 = _mm_abs_epi16(_mm_sub_epi16(_mm_add_epi16(_mm_add_epi16
(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(FirstP1, ThirdP1),
1)), _mm_add_epi16(ThirdP0, ThirdP2)));
// Interleave Gx/Gy so each 32-bit lane of _mm_madd_epi16 computes
// Gx*Gx + Gy*Gy in a single instruction.
__m128i GXYL = _mm_unpacklo_epi16(GX16, GY16);
__m128i GXYH = _mm_unpackhi_epi16(GX16, GY16);
// Magnitude sqrt(Gx^2 + Gy^2) computed in float, rounded back to int.
__m128i ResultL = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps
(_mm_madd_epi16(GXYL, GXYL))));
__m128i ResultH = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps
(_mm_madd_epi16(GXYH, GXYH))));
// Saturate-pack 32 -> 16 -> 8 bits (clamps to 255) and store 8 bytes.
_mm_storel_epi64((__m128i *)(pDstData4 + j), _mm_packus_epi16
(_mm_packus_epi32(ResultL, ResultH), Zero));
}
// Scalar tail for the last bytes of the row not covered by SIMD blocks.
for (int j = nBlock * nBlockSize; j < nCols*nChan; j++)
{
int nX = (pSrcData1[j + nChan2] - pSrcData1[j]
+ ((pSrcData4[j + nChan2] - pSrcData4[j]) << 1)
+ pSrcData7[j + nChan2] - pSrcData7[j]);
int nY = (pSrcData7[j] - pSrcData1[j]
+ ((pSrcData7[j + nChan] - pSrcData1[j + nChan]) << 1)
+ pSrcData7[j + nChan2] - pSrcData1[j + nChan2]);
pDstData4[j] = Table[min(nX * nX + nY * nY, 65025)];
}
};
std::vector<std::future<void>> fut(nThreadSize);
__m128i Zero = _mm_setzero_si128();
for (int i = 0; i < nThreadBlock; i++)
{
for (int j = 0; j < nThreadSize; j++)
{
int index_ = i*nThreadSize + j;
// BUG FIX: compute this task's row pointers BEFORE launching it. The
// original advanced the pointers only after std::async, so every task
// filtered the previous task's row: row 0 was processed by two
// concurrent tasks (a data race on the same destination row) and the
// last row of each dispatched range was never processed.
pSrcData1 = pSrcData + nSrcStep * index_;
pSrcData4 = pSrcData1 + nSrcStep;
pSrcData7 = pSrcData4 + nSrcStep;
pDstData4 = pDstData + nDstStep * index_;
fut[j] = std::async(std::launch::async, _func, pSrcData1, pSrcData4,
pSrcData7, pDstData4, nBlock, nBlockSize, nChan, nChan2, nCols, Zero,
Table);
}
// Wait for the whole wave before reusing the future slots.
for (int j = 0; j < nThreadSize; j++)
{
fut[j].wait();
}
}
// Leftover rows (nRows % nThreadSize) run on the calling thread.
// BUG FIX: the original guard "nThreadBlock*nThreadSize < nRows - 1" skipped
// this work when exactly one row remained, and reused stale pointers from
// the dispatch loop; the loop condition alone is the correct guard, with the
// pointers recomputed per row.
for (int i = nThreadBlock*nThreadSize; i < nRows; i++)
{
pSrcData1 = pSrcData + nSrcStep * i;
pSrcData4 = pSrcData1 + nSrcStep;
pSrcData7 = pSrcData4 + nSrcStep;
pDstData4 = pDstData + nDstStep * i;
_func(pSrcData1, pSrcData4, pSrcData7, pDstData4, nBlock, nBlockSize,
nChan, nChan2, nCols, Zero, Table);
}
运行时间又提高了16ms左右:
总结
本文主要是采用C++实现了3*3的Sobel算子,并通过代码优化、SSE指令集和多线程技术等方式对算法进行优化,速度从原始版本的600ms,提升到了最后的22ms,速度还是比较可以的,但是最后一版多线程的优化,如果图像不是很大的时候,其实作用不大…
更多图像处理算法和优化技术,还有待学习,路漫漫…
曾经沧海难为水,除却巫山不是云。
取次花丛懒回顾,半缘修道半缘君。
—— 元稹 《离思五首·其四》
开放原子开发者工作坊旨在鼓励更多人参与开源活动,与志同道合的开发者们相互交流开发经验、分享开发心得、获取前沿技术趋势。工作坊有多种形式的开发者活动,如meetup、训练营等,主打技术交流,干货满满,真诚地邀请各位开发者共同参与!
更多推荐
所有评论(0)