pipeline 总结

1、简易流水线实现

参考:https://zhuanlan.zhihu.com/p/56317767

不用流水线方式

img

// Non-pipelined version: both multiplications, the subtraction and the two
// additions are evaluated in a single expression and registered once in y.
// Reset is asynchronous and active-low (rst_n).
always @(posedge clk or negedge rst_n) begin
    if (!rst_n)
        y <= 1'b0;
    else
        // The terminating ';' was missing in the original statement.
        y <= a_reg * b_reg + c_reg * d_reg + (e_reg - f_reg);
end

流水线方式

img

// Pipeline stage 1: register the two products and the difference.
// mux_pre_1 / mux_pre_2 hold a_reg*b_reg and c_reg*d_reg, sub_pre_1 holds
// e_reg-f_reg. Asynchronous active-low reset clears all three registers.
always @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        {sub_pre_1, mux_pre_2, mux_pre_1} <= 'b0;
    end
    else begin
        sub_pre_1 <= e_reg - f_reg;
        mux_pre_2 <= c_reg * d_reg;
        mux_pre_1 <= a_reg * b_reg;
    end
end

// Pipeline stage 2: add the two registered products and delay the registered
// difference by one more cycle so it stays aligned with the sum.
// Asynchronous active-low reset clears both registers.
always @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        add_pre_1 <= 'b0;
        sub_pre_2 <= 'b0;
    end
    else begin
        // Fixed: the original read "mux_pre1", which is not the register
        // written by stage 1 (mux_pre_1).
        add_pre_1 <= mux_pre_1 + mux_pre_2;
        sub_pre_2 <= sub_pre_1;
    end
end

// Pipeline stage 3: final addition of the stage-2 sum and the delayed
// difference. Asynchronous active-low reset clears the result register.
always @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        add_pre_2 <= 'b0;
    end
    else begin
        add_pre_2 <= sub_pre_2 + add_pre_1;
    end
end

仿真结果
在这里插入图片描述

2、 valid/ready 信号结合pipeline

参考:https://blog.csdn.net/rill_zhen/article/details/45980039

// Three-stage pipeline built from the stages pa (Mpa), pb (Mpb) and pc (Mpc).
// Each stage's valid_o/data_o feed the next stage's valid_i/data_i, and each
// stage's ready_o feeds the previous stage's ready_i. The last stage's
// ready_i is tied high. Mpb and Mpc are not defined in this excerpt.
module Mpipeline(
	input clk,
	input rst_n,
	
	input en_i,
	input [7:0] data_i,
	
	output en_o,
	output [7:0] data_o,
	
	output idle);

	// pa -> pb link
	wire       pa_valid;
	wire [7:0] pa_data;
	wire       pb_ready;

	// pb -> pc link
	wire       pb_valid;
	wire [7:0] pb_data;
	wire       pc_ready;

	// ready_o of the first stage; not routed to any port of this module
	wire       pa_ready_unused;

	Mpa pa(
		.clk     (clk),
		.rst_n   (rst_n),
		.valid_i (en_i),
		.data_i  (data_i),
		.valid_o (pa_valid),
		.data_o  (pa_data),
		.ready_i (pb_ready),
		.ready_o (pa_ready_unused)
	);

	Mpb pb(
		.clk     (clk),
		.rst_n   (rst_n),
		.valid_i (pa_valid),
		.data_i  (pa_data),
		.valid_o (pb_valid),
		.data_o  (pb_data),
		.ready_i (pc_ready),
		.ready_o (pb_ready)
	);

	Mpc pc(
		.clk     (clk),
		.rst_n   (rst_n),
		.valid_i (pb_valid),
		.data_i  (pb_data),
		.valid_o (en_o),
		.data_o  (data_o),
		.ready_i (1'b1),
		.ready_o (pc_ready)
	);

	// idle is high only when none of the three stage outputs is valid
	assign idle = ~(pa_valid | pb_valid | en_o);
endmodule
 
// First pipeline stage: registers data_i + 1 and a valid flag.
// Reset is synchronous and active-low. ready_i is forwarded to ready_o
// unchanged and does not gate either register.
module Mpa(
	input clk,
	input rst_n,
	
	input 			valid_i, //from pre-stage
	input [7:0] 	data_i, //from pre-stage
	input 			ready_i, //from post-stage
	
	output 			ready_o,//to pre-stage
	
	output 			valid_o, //to post-stage
	output [7:0]	data_o //to post-stage
);

	// Stage function. According to the original notes, the sibling stages
	// differ only in this expression:
	//   module Mpb: assign calc = data_i << 1'b1;
	//   module Mpc: assign calc = data_i - 1'b1;
	wire [7:0] calc;
	assign calc = data_i + 1'b1;

	reg       vld_q;
	reg [7:0] dat_q;

	always @(posedge clk) begin
		if (~rst_n) begin
			vld_q <= 1'b0;
			dat_q <= 8'b0;
		end
		else begin
			if (valid_i) begin
				// accept a new word: flag it valid and capture the result
				vld_q <= 1'b1;
				dat_q <= calc;
			end
			else if (~valid_i) begin
				// no input this cycle: drop valid, keep the last data value
				vld_q <= 1'b0;
			end
		end
	end

	assign data_o  = dat_q;
	assign valid_o = vld_q;
	assign ready_o = ready_i;
endmodule

testbench

// Testbench for Mpipeline: holds reset for 10 rising clock edges, waits 10
// more, then drives the words 1, 2, 3, 4 on consecutive rising edges with
// en_i high, deasserts en_i, and finishes 10 edges later.
module Ttb;
	reg clk;
	reg rst_n;
	reg en_i_r;
	reg [7:0] data_i_r;
	
	wire en_o;
	wire [7:0] data_o;
	wire idle;

	integer beat; // index of the stimulus word being driven
	
	Mpipeline pipeline
	(
	.clk    (clk),
	.rst_n  (rst_n),
	.en_i   (en_i_r),
	.data_i (data_i_r),
	.en_o   (en_o),
	.data_o (data_o),
	.idle   (idle)
	);
	
	initial
		begin
			data_i_r = 8'b0;
			en_i_r   = 1'b0;
			rst_n    = 1'b0;
			clk      = 1'b0;
			
			// free-running clock, toggling every 5 time units
			fork
				forever #5 clk = ~clk;
			join_none
			
			// release reset after 10 rising edges, then idle for 10 more
			repeat(10) @(posedge clk);
			rst_n = 1'b1;
			repeat(10) @(posedge clk);
			
			// four back-to-back valid words: 1, 2, 3, 4
			for (beat = 1; beat <= 4; beat = beat + 1) begin
				@(posedge clk);
				en_i_r   <= 1'b1;
				data_i_r <= beat[7:0];
			end
			
			// stop driving and let the pipeline drain
			@(posedge clk);
			en_i_r   <= 1'b0;
			data_i_r <= 8'h0;
			repeat(10) @(posedge clk);
			$finish();
		end
endmodule

思路:

  • 将几个结构相似的模块串联起来,前一个模块的输出作为后一个模块的输入,前一个模块的输出有效信号 valid_o 作为后一个模块的输入有效信号valid_i

pipeline_structure

3、流水线设计高速乘法器(移位实现)

参考:https://www.runoob.com/w3cnote/verilog-pipeline-design.html

直接使用 `*` 运算符或调用 IP 核实现乘法,综合出的电路性能可能不理想。采用移位相加的方式实现高速乘法器,更适合硬件实现。

设计原理

多比特数相乘,相当于被乘数按照乘数对应bit位进行移位累加

img

乘法器设计 - 非流水线

思路:选取位数较少的操作数作为乘数,并将被乘数零扩展到与结果相同的位宽(N+M 位)。从低位到高位逐位检查乘数:若该位为 1,则部分积为左移对齐后的被乘数(部分积的最低位与该乘数位对齐);若该位为 0,则部分积为 0。最后将所有部分积累加,即得到乘积。

// Iterative (non-pipelined) shift-and-add multiplier.
//
// Parameters:
//   N - width of the multiplicand mult1
//   M - width of the multiplier  mult2
//
// A multiplication starts when data_rdy is high while cnt is 0. One
// multiplier bit is consumed per clock; when cnt reaches M the accumulated
// product is copied to res and res_rdy is high for one clock. In every other
// cycle res and res_rdy are 0. Reset is asynchronous and active-low.
module mult_low
#(parameter N=4,
  parameter M=4
 )(
      input                     clk,
      input                     rstn,
      input                     data_rdy ,  // input valid
      input [N-1:0]             mult1,      // multiplicand
      input [M-1:0]             mult2,      // multiplier

      output                    res_rdy ,   // output valid
      output [N+M-1:0]          res         // product
 );

    // Multiplication cycle counter: wraps to 0 after reaching M.
    reg [31:0]           cnt ;
    wire [31:0]          cnt_temp = (cnt == M)? 'b0 : cnt + 1'b1 ;
    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            cnt <= 'b0 ;
        end
        else if (data_rdy) begin    // start (or keep) counting while input is valid
            cnt <= cnt_temp ;
        end
        else if (cnt != 0 ) begin   // keep counting even if data_rdy was only a short pulse
            cnt <= cnt_temp ;
        end
        else begin
            cnt <= 'b0 ;
        end
    end

    // Shift-and-add datapath.
    reg [M-1:0]          mult2_shift ;   // multiplier, shifted right each cycle
    reg [M+N-1:0]        mult1_shift ;   // multiplicand, shifted left each cycle
    reg [M+N-1:0]        mult1_acc ;     // running sum of partial products
    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            // Fixed: the original assigned mult2_shift twice here and never
            // reset mult1_shift.
            mult1_shift    <= 'b0 ;
            mult2_shift    <= 'b0 ;
            mult1_acc      <= 'b0 ;
        end
        else if (data_rdy && cnt=='b0) begin  // load operands and handle bit 0
            // Fixed: zero-extend by M bits so the value is exactly N+M bits
            // wide (the original prepended N zeros, giving 2N bits).
            mult1_shift    <= {{(M){1'b0}}, mult1} << 1 ;
            mult2_shift    <= mult2 >> 1 ;
            mult1_acc      <= mult2[0] ? {{(M){1'b0}}, mult1} : 'b0 ;
        end
        else if (cnt != M) begin
            mult1_shift    <= mult1_shift << 1 ;  // multiplicand times 2
            mult2_shift    <= mult2_shift >> 1 ;  // expose the next multiplier bit at bit 0
            // accumulate the shifted multiplicand when the current multiplier bit is 1
            mult1_acc      <= mult2_shift[0] ? mult1_acc + mult1_shift : mult1_acc;
        end
        else begin
            // cnt == M: the result is being captured below; clear the datapath.
            // Fixed: same duplicated mult2_shift assignment as in the reset branch.
            mult1_shift    <= 'b0 ;
            mult2_shift    <= 'b0 ;
            mult1_acc      <= 'b0 ;
        end
    end

    // Result register: valid for the single cycle that follows cnt == M.
    reg [M+N-1:0]        res_r ;
    reg                  res_rdy_r ;
    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            res_r          <= 'b0 ;
            res_rdy_r      <= 'b0 ;
        end
        else if (cnt == M) begin
            res_r          <= mult1_acc ;  // capture the product at the end of the multiplication
            res_rdy_r      <= 1'b1 ;
        end
        else begin
            res_r          <= 'b0 ;
            res_rdy_r      <= 'b0 ;
        end
    end

    assign res_rdy       = res_rdy_r;
    assign res           = res_r;

endmodule

testbench

`timescale 1ns/1ns

// Testbench for mult_low (N=8, M=4): applies five operand pairs one at a
// time, waiting for res_rdy between them, and ends the simulation once
// $time reaches 10000.
module test ;
    parameter    N = 8 ;
    parameter    M = 4 ;
    reg          clk, rstn;
 
   //clock: period of 10 time units
    always begin
        clk = 0 ; #5 ;
        clk = 1 ; #5 ;
    end

   //reset: active-low, released at time 8
    initial begin
        rstn      = 1'b0 ;
        #8 ;      rstn      = 1'b1 ;
    end

    //=============================================//
    // Fixed: the separator line above was missing its leading "//" and did
    // not compile.
    //no pipeline
    reg                  data_rdy_low ;
    reg [N-1:0]          mult1_low ;
    reg [M-1:0]          mult2_low ;
    wire [M+N-1:0]       res_low ;
    wire                 res_rdy_low ;

    // Drive one multiplication: wait until the DUT is not presenting a
    // result, assert data_rdy for one clock (changing inputs on the falling
    // edge), then block until res_rdy goes high.
    task mult_data_in ;  
        input [M+N-1:0]   mult1_task, mult2_task ;
        begin
            wait(!test.u_mult_low.res_rdy) ;  //not output state
            @(negedge clk ) ;
            data_rdy_low = 1'b1 ;
            mult1_low = mult1_task ;
            mult2_low = mult2_task ;
            @(negedge clk ) ;
            data_rdy_low = 1'b0 ;
            wait(test.u_mult_low.res_rdy) ; //test the output state
        end
    endtask

    //driver
    initial begin
        // Give the stimulus regs known values so the DUT does not see X
        // before the first task call.
        data_rdy_low = 1'b0 ;
        mult1_low    = 'b0 ;
        mult2_low    = 'b0 ;
        #55 ;
        mult_data_in(25, 5 ) ;
        mult_data_in(16, 10 ) ;
        mult_data_in(10, 4 ) ;
        mult_data_in(15, 7) ;
        mult_data_in(215, 9) ;
    end

    mult_low  #(.N(N), .M(M))
    u_mult_low
    (
      .clk              (clk),
      .rstn             (rstn),
      .data_rdy         (data_rdy_low),
      .mult1            (mult1_low),
      .mult2            (mult2_low),
      .res_rdy          (res_rdy_low),
      .res              (res_low));

   //simulation finish: poll every 100 time units
   initial begin
      forever begin
         #100;
         if ($time >= 10000)  $finish ;
      end
   end

endmodule // test

仿真结果

img

乘法器设计- 流水线

思路:把“一次移位 + 一次部分积累加”作为流水线的基本单元(mult_cell)。每一级根据当前乘数的最低位决定是否把被乘数累加到结果中,然后将被乘数左移一位(使下一个部分积与乘数的下一位对齐)、乘数右移一位(使下一位成为最低位),并把这些值连同累加结果传给下一级;M 级串联后,最后一级的输出即为最终乘积。

// One shift-and-add step of the pipelined multiplier.
// When en is high, the stage adds mult1 to the incoming sum if mult2[0] is 1,
// and registers mult1 shifted left by one and mult2 shifted right by one for
// the next stage. When en is not high, all outputs are registered as 0.
// Reset is asynchronous and active-low.
module    mult_cell
    #(parameter N=4,
      parameter M=4)
    (
      input                     clk,
      input                     rstn,
      input                     en,
      input [M+N-1:0]           mult1,      // multiplicand for this step
      input [M-1:0]             mult2,      // multiplier; bit 0 is examined in this step
      input [M+N-1:0]           mult1_acci, // accumulated sum from the previous step

      output reg [M+N-1:0]      mult1_o,     // multiplicand after the left shift
      output reg [M-1:0]        mult2_shift, // multiplier after the right shift
      output reg [N+M-1:0]      mult1_acco,  // accumulated sum after this step
      output reg                rdy );       // high in the cycle after en was high

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            mult2_shift    <= 'b0 ;
            mult1_acco     <= 'b0 ;
            mult1_o        <= 'b0 ;
            rdy            <= 'b0 ;
        end
        else if (en) begin
            rdy            <= 1'b1 ;
            mult1_o        <= mult1 << 1 ;
            mult2_shift    <= mult2 >> 1 ;
            case (mult2[0])
                // current multiplier bit is 1: add the multiplicand
                1'b1    : mult1_acco  <= mult1_acci + mult1 ;
                // otherwise pass the sum through unchanged
                default : mult1_acco  <= mult1_acci ;
            endcase
        end
        else begin
            mult2_shift    <= 'b0 ;
            mult1_acco     <= 'b0 ;
            mult1_o        <= 'b0 ;
            rdy            <= 'b0 ;
        end
    end

endmodule

// Top level of the pipelined multiplier: M mult_cell stages in series.
// Stage 0 takes the zero-extended multiplicand, the multiplier and a zero
// sum; every later stage takes the registered outputs of the stage before
// it. The product and its valid flag come from stage M-1.
module    mult_man
    #(parameter N=4,
      parameter M=4)
    (
      input                     clk,
      input                     rstn,
      input                     data_rdy ,
      input [N-1:0]             mult1,
      input [M-1:0]             mult2,

      output                    res_rdy ,
      output [N+M-1:0]          res );

    // Registered outputs of each stage, indexed by stage number.
    wire [N+M-1:0]       mcand_q  [M-1:0] ;   // shifted multiplicand
    wire [M-1:0]         mplier_q [M-1:0] ;   // shifted multiplier
    wire [N+M-1:0]       acc_q    [M-1:0] ;   // accumulated sum
    wire [M-1:0]         vld_q ;              // per-stage rdy flags

    // The first stage is fed from the module inputs rather than from a
    // previous stage, so it is instantiated outside the generate loop.
    mult_cell      #(.N(N), .M(M))
    u_mult_step0
    (
      .clk              (clk),
      .rstn             (rstn),
      .en               (data_rdy),
      .mult1            ({{(M){1'b0}}, mult1}),
      .mult2            (mult2),
      .mult1_acci       ({(N+M){1'b0}}),
      //output
      .rdy              (vld_q[0]),
      .mult1_o          (mcand_q[0]),
      .mult2_shift      (mplier_q[0]),
      .mult1_acco       (acc_q[0]) );

    // Stages 1 .. M-1: each consumes the outputs of the previous stage.
    genvar               i ;
    generate
        for(i=1; i<M; i=i+1) begin: mult_stepx
            mult_cell      #(.N(N), .M(M))
            u_mult_step
            (
              .clk              (clk),
              .rstn             (rstn),
              .en               (vld_q[i-1]),
              .mult1            (mcand_q[i-1]),
              .mult2            (mplier_q[i-1]),
              .mult1_acci       (acc_q[i-1]),
              //output
              .rdy              (vld_q[i]),
              .mult1_o          (mcand_q[i]),
              .mult2_shift      (mplier_q[i]),
              .mult1_acco       (acc_q[i]) );
        end
    endgenerate

    assign res           = acc_q[M-1];
    assign res_rdy       = vld_q[M-1];

endmodule

testbench 关键部分

    // Stimulus registers and result wires for the pipelined multiplier
    // (testbench excerpt; N, M, clk and rstn are declared outside this
    // excerpt).
    reg          data_rdy ;
    reg [N-1:0]  mult1 ;
    reg [M-1:0]  mult2 ;
    wire                 res_rdy ;
    wire [N+M-1:0]       res ;

    // Driver: after 55 time units, align to a falling clock edge, assert
    // data_rdy (it is never deasserted in this block) and apply a new operand
    // pair every 10 time units. Each repeat(32) line fixes mult2 and
    // increments mult1 once per 10 time units.
    initial begin
        #55 ;
        @(negedge clk ) ;
        data_rdy  = 1'b1 ;
        mult1  = 25;      mult2      = 5;
        #10 ;      mult1  = 16;      mult2      = 10;
        #10 ;      mult1  = 10;      mult2      = 4;
        #10 ;      mult1  = 15;      mult2      = 7;
        mult2      = 7;   repeat(32)    #10   mult1   = mult1 + 1 ;
        mult2      = 1;   repeat(32)    #10   mult1   = mult1 + 1 ;
        mult2      = 15;  repeat(32)    #10   mult1   = mult1 + 1 ;
        mult2      = 3;   repeat(32)    #10   mult1   = mult1 + 1 ;
        mult2      = 11;  repeat(32)    #10   mult1   = mult1 + 1 ;
        mult2      = 4;   repeat(32)    #10   mult1   = mult1 + 1 ;
        mult2      = 9;   repeat(32)    #10   mult1   = mult1 + 1 ;
    end

    // Delay the input operands through M-entry shift registers, one entry
    // per rising clock edge, so they can be compared with the result later.
    reg  [N-1:0]   mult1_ref [M-1:0];
    reg  [M-1:0]   mult2_ref [M-1:0];
    always @(posedge clk) begin
        mult1_ref[0] <= mult1 ;
        mult2_ref[0] <= mult2 ;
    end

    // Entries 1 .. M-1 each take the value of the entry before them.
    genvar i;
    generate
        for(i=1; i<=M-1; i=i+1) begin
            always @(posedge clk) begin
            mult1_ref[i] <= mult1_ref[i-1];
            mult2_ref[i] <= mult2_ref[i-1];
            end
        end
    endgenerate
   
    // Self-check: 1 time unit after every rising clock edge, set error_flag
    // when res_rdy is high and res differs from the product of the oldest
    // delayed operands; clear it otherwise.
    reg  error_flag ;
    always @(posedge clk) begin
        # 1 ;
        if (mult1_ref[M-1] * mult2_ref[M-1] != res && res_rdy) begin
            error_flag <= 1'b1 ;
        end
        else begin
            error_flag <= 1'b0 ;
        end
    end

    // Device under test: the pipelined multiplier mult_man, parameterised
    // with the testbench's N and M.
    mult_man  #(.N(N), .M(M))
     		u_mult(
     			 .clk              (clk),
     			 .rstn             (rstn),
      			.data_rdy         (data_rdy),
     			 .mult1            (mult1),
      			.mult2            (mult2),
     			 .res_rdy          (res_rdy),
     			 .res              (res));

仿真结果

img

Logo

开放原子开发者工作坊旨在鼓励更多人参与开源活动,与志同道合的开发者们相互交流开发经验、分享开发心得、获取前沿技术趋势。工作坊有多种形式的开发者活动,如meetup、训练营等,主打技术交流,干货满满,真诚地邀请各位开发者共同参与!

更多推荐