Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
X
x393
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Commits
Open sidebar
Elphel
x393
Commits
ba35eb7a
Commit
ba35eb7a
authored
Nov 01, 2015
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added modified xdct393 module to increase clock frequency
parent
30fae557
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
660 additions
and
0 deletions
+660
-0
xdct393r.v
compressor_jp/xdct393r.v
+660
-0
No files found.
compressor_jp/xdct393r.v
0 → 100644
View file @
ba35eb7a
/**********************************************************************
** -----------------------------------------------------------------------------**
** xdct393r.v
**
** 8x8 discrete Cosine Transform
** adding more registers to increase bandwidth
**
** Copyright (C) 2002-2015 Elphel, Inc
**
** -----------------------------------------------------------------------------**
** xdct393r is free software - hardware description language (HDL) code.
**
** This program is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
** -----------------------------------------------------------------------------**
**
** Modified by Andrey Filippov - goal to make it work in start/stop mode, using
** "start" input (going together with the first data, no restriction on the gap between 64-pixel blocks (>=0)
** Removed "RST" input ("en" is only used to reset ping-pong transpose memory address)
** Split module in 2 stages
** Also saved some area - original design compiled by XST to 865 slices (XC2S300e), this one - 780!
**
** It is based on the original design (Xilinx app. note XAPP610) by:
** Author: Latha Pillai
** Senior Applications Engineer
**
** Video Applications
** Advanced Products Group
** Xilinx, Inc.
**
** Copyright (c) 2001 Xilinx, Inc.
** All rights reserved
**
** Date: Feb. 10, 2002
**
** RESTRICTED RIGHTS LEGEND
**
** This software has not been published by the author, and
** has been disclosed to others for the purpose of enhancing
** and promoting design productivity in Xilinx products.
**
** Therefore use, duplication or disclosure, now and in the
** future should give consideration to the productivity
** enhancements afforded the user of this code by the author's
** efforts. Thank you for using our products !
**
** Disclaimer: THESE DESIGNS ARE PROVIDED "AS IS" WITH NO WARRANTY
** WHATSOEVER AND XILINX SPECIFICALLY DISCLAIMS ANY
** IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR
** A PARTICULAR PURPOSE, OR AGAINST INFRINGEMENT.
***********************************************************************/
/*
after I added DC subtraction before DCT I got 9-bit (although not likely to go out of 8-bit range) signed data.
also increased transpose memory to 9 bits (anyway it is 16-bit wide) - see if it will help to prevent saturation
without significant increase in gates
Saturation is still visible on real pictures, but there was a bug - addsub<i>a_comp, addsub<i>b_comp were not using their
MSB. I added 1 more bit to add_sub<i>a and add_sub<i>b and fixed that bug. Only 2 more slices were used
*/
`timescale
1
ns
/
1
ps
// For xdct353 - increasing data in 9 bits -> 10 bits, out 12 bits ->13 bits
module xdct393r (
    // Top level of the 8x8 DCT: stage 1 (first 1-D pass) writes its results to a
    // 16-bit wide transpose memory, stage 2 (second 1-D pass) reads them back in
    // transposed order and produces the final coefficients.
    // Original note: latency increased by 3 relative to the earlier version.
    input             clk,           // system clock (this module's own registers use posedge)
    input             en,            // when low, stage 1 clears its transpose memory write address
                                     // (original note: resets transpose memory page numbers)
    input             start,         // single-cycle pulse accompanying the first pixel of a block;
                                     // the remaining 63 pixels are expected to follow
    input      [ 9:0] xin,           // 10-bit input data, handled as signed (sign-extended) in stage 1
    output reg        last_in,       // registered (tm_wa[5:0] == 6'h30); per the original note it is
                                     // meant to be high while the last of the 64 input pixels arrives
                                     // NOTE(review): alignment depends on stage 1 latency - confirm
    output            pre_first_out, // stage 2 "endv": one cycle ahead of the first output of a block
    output            dv,            // output data valid
                                     // NOTE(review): original note says "94-th cycle after start";
                                     // that figure may predate the added pipeline registers - confirm
    output     [12:0] d_out          // 13-bit DCT output data
);

    // ---- stage 1 -> transpose memory (write side) ----
    wire        tm_we;        // write enable
    wire [ 6:0] tm_wa;        // write address; bit 6 is used by stage 1 as the page bit
    wire [15:0] tm_di;        // write data
    wire        tm_page;      // page that stage 1 has just filled (sampled by stage 2 at its start)
    wire        stage1_done;  // stage 1 finished a block; used as the start pulse of stage 2

    // ---- transpose memory -> stage 2 (read side) ----
    wire        tm_re;        // read enable, driven by stage 2
    wire        tm_regen;     // memory output register enable, driven by stage 2
    wire [ 6:0] tm_ra;        // read address
    wire [15:0] tm_out;       // read data

    // Flag derived from the low 6 bits of the stage 1 write address.
    always @ (posedge clk)
        last_in <= (tm_wa[5:0] == 6'h30);

    // First 1-D DCT pass: consumes xin, fills the transpose memory.
    dct393r_stage1 i_dct_stage1 (
        .clk     (clk),
        .en      (en),
        .start   (start),
        .xin     (xin),           // [9:0]
        .z_out   (tm_di[15:0]),   // data to transpose memory
        .wr_cntr (tm_wa),         // [6:0] transpose memory write address
        .we      (tm_we),         // write to transpose memory
        .page    (tm_page),
        .done    (stage1_done)
    );

    // Second 1-D DCT pass: started by stage1_done, reads the transpose memory.
    dct393r_stage2 i_dct_stage2 (
        .clk      (clk),
        .en       (en),
        .start    (stage1_done),  // stage 1 finished, data available in transpose memory
        .page     (tm_page),      // transpose memory page finished, valid at start
        .tdin     (tm_out[15:0]), // [15:0] data from transpose memory
        .rd_cntr  (tm_ra[6:0]),   // [6:0] transpose memory read address
        .ren      (tm_re),        // output: memory read enable
        .regen    (tm_regen),     // output: memory output register enable
        .endv     (pre_first_out),
        .dv       (dv),           // data output valid
        .dct2_out (d_out[12:0])   // [12:0] output data
    );

    // Transpose memory. Both ports run from clk; the 7-bit addresses are
    // zero-padded to 10 bits and all four write-enable bits (web) are tied high.
    // NOTE(review): REGISTERS(1) presumably enables the RAM output register that
    // tm_regen controls - confirm against the ram18_var_w_var_r definition.
    ram18_var_w_var_r #(
        .REGISTERS    (1),
        .LOG2WIDTH_WR (4),
        .LOG2WIDTH_RD (4),
        .DUMMY        (0)
    ) i_transpose_mem (
        // write port
        .wclk     (clk),
        .we       (tm_we),
        .web      (4'hf),                // [3:0]
        .waddr    ({3'b0, tm_wa[6:0]}),  // [9:0]
        .data_in  (tm_di[15:0]),         // [15:0]
        // read port
        .rclk     (clk),
        .ren      (tm_re),
        .regen    (tm_regen),
        .raddr    ({3'b0, tm_ra[6:0]}),  // [9:0]
        .data_out (tm_out[15:0])         // [15:0]
    );

endmodule
// 01/24/2004: Moved all clocks in stage 1 to "negedge" to reduce current pulses
module dct393r_stage1 (
    // First 1-D DCT pass. Eight consecutive input samples are collected in a shift
    // register, latched, combined pairwise (sum / difference), multiplied by
    // cosine coefficients and accumulated; the rounded result goes to the
    // transpose memory.
    // Everything here is clocked by nclk (= ~clk) except the z_out register and the
    // i_delayed_outs delay line, which use clk.
    // Original note: latency increased by 1.
    input             clk,     // system clock
    input             en,      // when low: clears the write counter, stops toggling and write enable
    input             start,   // single-cycle start pulse (original note: replaces RST)
    input      [ 9:0] xin,     // [9:0] input sample, sign-extended from bit 9 before add/sub
    output            we,      // write to transpose memory
    output     [ 6:0] wr_cntr, // [6:0] transpose memory write address
    output reg [15:0] z_out,   // data to transpose memory
    output            page,    // transpose memory page just filled (original note: valid @ done)
    output            done     // delayed flag of write address [5:0] == 6'h3f (last write of a block)
);

    /* constants: cos/sin(k*pi/16) scaled by 2^16 (e.g. C4 = 46341 ~ 0.70711 * 65536) */
    localparam C3 = 16'd54491;
    localparam S3 = 16'd36410;
    localparam C4 = 16'd46341;
    localparam C6 = 16'd25080;
    localparam S6 = 16'd60547;
    localparam C7 = 16'd12785;
    localparam S7 = 16'd64277;

    // Inverted clock used by almost all registers of this stage.
    // (Original TODO: see if negedge is needed.)
    wire        nclk;
    assign      nclk = ~clk;

    // Current coefficient set: bit 16 is the sign flag, bits [15:0] the magnitude.
    reg  [16:0] memory1a, memory2a, memory3a, memory4a;
    reg  [ 2:0] indexi;        // selects the coefficient set

    // Input shift register and the latched copy of its contents.
    reg  [ 9:0] xa0_in,  xa1_in,  xa2_in,  xa3_in,  xa4_in,  xa5_in,  xa6_in,  xa7_in;
    reg  [ 9:0] xa0_reg, xa1_reg, xa2_reg, xa3_reg, xa4_reg, xa5_reg, xa6_reg, xa7_reg;

    // Sum/difference stage: signed result, its magnitude and its sign.
    reg  [10:0] add_sub1a, add_sub2a, add_sub3a, add_sub4a;
    reg  [ 9:0] addsub1a_comp, addsub2a_comp, addsub3a_comp, addsub4a_comp;
    // reg [10:0] addsub1a_comp, ...; // AF2015: increasing width - was limiting
    reg         save_sign1a, save_sign2a, save_sign3a, save_sign4a;

    // Multiplier stage.
    wire [35:0] p1a_all, p2a_all, p3a_all, p4a_all;         // unsigned products
    reg  [17:0] p1a_all_r, p2a_all_r, p3a_all_r, p4a_all_r; // registered product bits [26:9]
    reg         p1a_sig, p2a_sig, p3a_sig, p4a_sig;         // sign of each product
    reg  [17:0] p1a, p2a, p3a, p4a;                         // signed products

    // Accumulation and rounding.
    reg  [18:0] z_out_int1, z_out_int2;
    reg  [18:0] z_out_int;
    wire [15:0] z_out_prelatch;

    // Sequencing.
    reg         toggleA;          // 1: add the sample pairs, 0: subtract them
    reg         enable_toggle;    // gates toggleA / indexi activity (original note: to conserve energy)
    reg         sxregs;           // strobe: latch the shift register into xa*_reg
    wire        sxregs_d8;        // sxregs after the i_sxregs_d8 delay line
    wire        pre_sxregs;       // start after the i_pre_sxregs delay line
    wire        enwe;             // pre_sxregs after the i_enwe delay line; turns write enable on

    // Transpose memory write side, before the final one-stage delay to the outputs.
    reg  [ 6:0] wr_cntr_prelatch;
    reg         we_prelatch;
    reg         done_prelatch;
    reg         page_prelatch;
    wire        wr_last = (wr_cntr_prelatch[5:0] == 6'h3f); // at the last address of a 64-word page

    // ------------------------------------------------------------------
    // Delay lines (dly_16 is defined elsewhere; the original notes give the
    // effective delay as dly+1 cycles - not verifiable from this file).
    // ------------------------------------------------------------------
    // SRL16_1 i_sxregs_d8 (.Q(sxregs_d8), .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk),.D(sxregs)); // dly=7+1
    dly_16 #(.WIDTH(1)) i_sxregs_d8 (
        .clk  (nclk),
        .rst  (1'b0),
        .dly  (4'd7),        // dly=7+1
        .din  (sxregs),
        .dout (sxregs_d8)
    );

    // SRL16_1 i_pre_sxregs (.Q(pre_sxregs), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk), .D(start)); // dly=6+1
    dly_16 #(.WIDTH(1)) i_pre_sxregs (
        .clk  (nclk),
        .rst  (1'b0),
        .dly  (4'd6),        // dly=6+1
        .din  (start),
        .dout (pre_sxregs)
    );

    // SRL16_1 i_enwe (.Q(enwe), .A0(1'b1), .A1(1'b0), .A2(1'b1), .A3(1'b0), .CLK(clk), .D(pre_sxregs)); // dly=5+1
    dly_16 #(.WIDTH(1)) i_enwe (
        .clk  (nclk),
        .rst  (1'b0),
        .dly  (4'd5),        // dly=5+1
        .din  (pre_sxregs),
        .dout (enwe)
    );

    // ------------------------------------------------------------------
    // Transpose memory write sequencing
    // ------------------------------------------------------------------
    always @ (posedge nclk) begin
        done_prelatch <= wr_last;

        // remember which page (address bit 6) is being completed
        if (wr_last) page_prelatch <= wr_cntr_prelatch[6];

        // write enable: switched on by enwe, held until the last address or !en
        we_prelatch <= enwe || (en && we_prelatch && !wr_last);

        // 7-bit write address: cleared while disabled, advances on every write
        if (!en)              wr_cntr_prelatch <= 7'b0;
        else if (we_prelatch) wr_cntr_prelatch <= wr_cntr_prelatch + 1;
    end

    // ------------------------------------------------------------------
    // Row sequencing: latch strobe, add/sub toggle, coefficient index
    // ------------------------------------------------------------------
    always @ (posedge nclk) begin
        // latch strobe: the delayed start, or write address [2:0] == 1 in every
        // row of the block except the last one ([5:3] == 7)
        sxregs <= pre_sxregs ||
                  ((wr_cntr_prelatch[2:0] == 3'h1) && (wr_cntr_prelatch[5:3] != 3'h7));

        // set by sxregs, kept until sxregs_d8 arrives (or en goes low)
        enable_toggle <= en && (sxregs || (enable_toggle && !sxregs_d8));

        // forced to 1 after sxregs, then alternates while enabled
        toggleA <= sxregs || (enable_toggle && (~toggleA));

        // preset to 7 by sxregs, then counts up (wrapping 7 -> 0) while enabled
        if (sxregs)             indexi <= 3'h7;
        else if (enable_toggle) indexi <= indexi + 1;
    end

    /* 1D-DCT BEGIN */

    // ------------------------------------------------------------------
    // 1D-DCT coefficient table for the four multipliers: {sign, magnitude}
    // ------------------------------------------------------------------
    always @ (posedge nclk) begin
        case (indexi)
            0: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, C4}, {1'b0, C4}, {1'b0, C4}, {1'b0, C4}};
            1: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, S7}, {1'b0, C3}, {1'b0, S3}, {1'b0, C7}};
            2: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, S6}, {1'b0, C6}, {1'b1, C6}, {1'b1, S6}};
            3: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, C3}, {1'b1, C7}, {1'b1, S7}, {1'b1, S3}};
            4: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, C4}, {1'b1, C4}, {1'b1, C4}, {1'b0, C4}};
            5: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, S3}, {1'b1, S7}, {1'b0, C7}, {1'b0, C3}};
            6: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, C6}, {1'b1, S6}, {1'b0, S6}, {1'b1, C6}};
            7: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, C7}, {1'b1, S3}, {1'b0, C3}, {1'b1, S7}};
        endcase
    end

    // ------------------------------------------------------------------
    // Input shift register: xin enters at xa0_in and moves towards xa7_in.
    // xin comes from the other clock edge (original note: delay under 5 ns,
    // may be replaced by latches if needed).
    // ------------------------------------------------------------------
    always @ (posedge nclk)
        {xa7_in, xa6_in, xa5_in, xa4_in, xa3_in, xa2_in, xa1_in, xa0_in} <=
            {xa6_in, xa5_in, xa4_in, xa3_in, xa2_in, xa1_in, xa0_in, xin};

    // Snapshot of the shift register, taken when sxregs is high.
    always @ (posedge nclk) if (sxregs)
        {xa7_reg, xa6_reg, xa5_reg, xa4_reg, xa3_reg, xa2_reg, xa1_reg, xa0_reg} <=
            {xa7_in, xa6_in, xa5_in, xa4_in, xa3_in, xa2_in, xa1_in, xa0_in};

    // ------------------------------------------------------------------
    // Adder / subtractor block on sign-extended (11-bit) samples
    // ------------------------------------------------------------------
    wire [10:0] xa0_ext = {xa0_reg[9], xa0_reg};
    wire [10:0] xa1_ext = {xa1_reg[9], xa1_reg};
    wire [10:0] xa2_ext = {xa2_reg[9], xa2_reg};
    wire [10:0] xa3_ext = {xa3_reg[9], xa3_reg};
    wire [10:0] xa4_ext = {xa4_reg[9], xa4_reg};
    wire [10:0] xa5_ext = {xa5_reg[9], xa5_reg};
    wire [10:0] xa6_ext = {xa6_reg[9], xa6_reg};
    wire [10:0] xa7_ext = {xa7_reg[9], xa7_reg};

    // Uses negedge clk directly; this is the same edge as posedge nclk.
    always @ (negedge clk) begin
        if (toggleA) begin // sums of the mirrored sample pairs
            add_sub1a <= xa7_ext + xa0_ext;
            add_sub2a <= xa6_ext + xa1_ext;
            add_sub3a <= xa5_ext + xa2_ext;
            add_sub4a <= xa4_ext + xa3_ext;
        end else begin     // differences of the mirrored sample pairs
            add_sub1a <= xa7_ext - xa0_ext;
            add_sub2a <= xa6_ext - xa1_ext;
            add_sub3a <= xa5_ext - xa2_ext;
            add_sub4a <= xa4_ext - xa3_ext;
        end
    end

    // Magnitude of each sum/difference. Only bits [9:0] are registered below
    // (original note: the MSB is always 0).
    // NOTE(review): the single value -1024 (both operands -512, added) has
    // magnitude 1024, which needs bit 10 - confirm the input range excludes it.
    wire [10:0] addsub1a_comp_w = add_sub1a[10] ? (-add_sub1a) : add_sub1a;
    wire [10:0] addsub2a_comp_w = add_sub2a[10] ? (-add_sub2a) : add_sub2a;
    wire [10:0] addsub3a_comp_w = add_sub3a[10] ? (-add_sub3a) : add_sub3a;
    wire [10:0] addsub4a_comp_w = add_sub4a[10] ? (-add_sub4a) : add_sub4a;

    always @ (posedge nclk) begin
        {save_sign1a, addsub1a_comp} <= {add_sub1a[10], addsub1a_comp_w[9:0]};
        {save_sign2a, addsub2a_comp} <= {add_sub2a[10], addsub2a_comp_w[9:0]};
        {save_sign3a, addsub3a_comp} <= {add_sub3a[10], addsub3a_comp_w[9:0]};
        {save_sign4a, addsub4a_comp} <= {add_sub4a[10], addsub4a_comp_w[9:0]};
    end

    // ------------------------------------------------------------------
    // Sign-magnitude multiplication: unsigned product, sign handled apart
    // ------------------------------------------------------------------
    assign p1a_all = addsub1a_comp * memory1a[15:0]; // [16] is sign!
    assign p2a_all = addsub2a_comp * memory2a[15:0];
    assign p3a_all = addsub3a_comp * memory3a[15:0];
    assign p4a_all = addsub4a_comp * memory4a[15:0];

    // Register product bits [26:9] together with the product sign
    // (operand sign XOR coefficient sign).
    always @ (posedge nclk) begin
        p1a_all_r <= p1a_all[26:9];
        p1a_sig   <= (save_sign1a ^ memory1a[16]);

        p2a_all_r <= p2a_all[26:9];
        p2a_sig   <= (save_sign2a ^ memory2a[16]);

        p3a_all_r <= p3a_all[26:9];
        p3a_sig   <= (save_sign3a ^ memory3a[16]);

        p4a_all_r <= p4a_all[26:9];
        p4a_sig   <= (save_sign4a ^ memory4a[16]);
    end

    // Apply the sign: back to two's complement.
    always @ (posedge nclk) begin
        p1a <= p1a_sig ? (-p1a_all_r) : p1a_all_r;
        p2a <= p2a_sig ? (-p2a_all_r) : p2a_all_r;
        p3a <= p3a_sig ? (-p3a_all_r) : p3a_all_r;
        p4a <= p4a_sig ? (-p4a_all_r) : p4a_all_r;
    end

    // ------------------------------------------------------------------
    // Final adder: two-level registered sum of the four signed products
    // ------------------------------------------------------------------
    always @ (posedge nclk) begin
        z_out_int1 <= ({p1a[17], p1a} + {p2a[17], p2a});
        z_out_int2 <= ({p3a[17], p3a} + {p4a[17], p4a});
        z_out_int  <= (z_out_int1 + z_out_int2);
    end

    // Rounding: drop the three LSBs, add the highest dropped bit.
    assign z_out_prelatch[15:0] = z_out_int[18:3] + z_out_int[2];

    // ------------------------------------------------------------------
    // Outputs: crossing from nclk back to the clk edge
    // ------------------------------------------------------------------
    always @ (posedge clk)
        z_out[15:0] <= z_out_prelatch[15:0];

    // Address, done, write enable and page pass through a clk-clocked delay line.
    dly_16 #(.WIDTH(10)) i_delayed_outs (
        .clk  (clk),
        .rst  (1'b0),
        .dly  (4'd1),
        .din  ({wr_cntr_prelatch[6:0], done_prelatch, we_prelatch, page_prelatch}),
        .dout ({wr_cntr[6:0],          done,          we,          page})
    );

    /* 1D-DCT END */
endmodule
module dct393r_stage2 (
    // Second 1-D DCT pass. Reads the transpose memory in transposed address
    // order, collects eight words, forms sums / differences of mirrored pairs,
    // multiplies by cosine coefficients, accumulates and rounds to 13 bits.
    // All registers here use posedge clk.
    // Original note: latency increased by 2 clocks.
    input             clk,      // system clock, posedge
    input             en,       // when low: stops reading and forces the data-valid chain low
    input             start,    // stage 1 finished, data available in transpose memory
    input             page,     // transpose memory page finished, sampled while start is high
    output     [ 6:0] rd_cntr,  // [6:0] transpose memory read address
    output            ren,      // read enable transpose memory
    output reg        regen,    // register enable in transpose memory (ren delayed by one clock)
    input      [15:0] tdin,     // [15:0] data from transpose memory
    output reg        endv,     // one cycle ahead of starting (continuing) dv
    output reg        dv,       // data output valid
    output reg [12:0] dct2_out  // [12:0] output data
);

    /* constants: cos/sin(k*pi/16) scaled by 2^16 (e.g. C4 = 46341 ~ 0.70711 * 65536) */
    localparam C3 = 16'd54491;
    localparam S3 = 16'd36410;
    localparam C4 = 16'd46341;
    localparam C6 = 16'd25080;
    localparam S6 = 16'd60547;
    localparam C7 = 16'd12785;
    localparam S7 = 16'd64277;

    // Current coefficient set: bit 16 is the sign flag, bits [15:0] the magnitude.
    reg  [16:0] memory1a, memory2a, memory3a, memory4a;
    reg  [ 2:0] indexi;        // selects the coefficient set

    /* 2D section */
    // Input shift register and the latched copy of its contents.
    reg  [15:0] xb0_in,  xb1_in,  xb2_in,  xb3_in,  xb4_in,  xb5_in,  xb6_in,  xb7_in;
    reg  [15:0] xb0_reg, xb1_reg, xb2_reg, xb3_reg, xb4_reg, xb5_reg, xb6_reg, xb7_reg;

    // Sum/difference stage: signed result, its magnitude and its sign.
    reg  [16:0] add_sub1b, add_sub2b, add_sub3b, add_sub4b;
    reg  [15:0] addsub1b_comp, addsub2b_comp, addsub3b_comp, addsub4b_comp;
    reg         save_sign1b, save_sign2b, save_sign3b, save_sign4b;

    // Multiplier stage.
    wire [35:0] p1b_all, p2b_all, p3b_all, p4b_all;         // unsigned products
    reg  [18:0] p1b_all_r, p2b_all_r, p3b_all_r, p4b_all_r; // registered product bits [32:14]
    reg         p1b_sig, p2b_sig, p3b_sig, p4b_sig;         // sign of each product
    reg  [18:0] p1b, p2b, p3b, p4b;                         // signed products

    // Accumulation and rounding.
    reg  [19:0] dct2d_int1, dct2d_int2;
    reg  [20:0] dct_2d_int;
    wire [12:0] dct_2d_rnd;

    // Transpose memory read address.
    reg  [ 5:0] rd_cntrs;      // linear 0..63 read counter, stops at 63
    reg         rd_page;       // page captured from stage 1 at start
    wire        rd_last = (rd_cntrs[5:0] == 6'h3f); // counter at its final value

    // Sequencing.
    reg         toggleB;       // 1: add the word pairs, 0: subtract them
    reg         enable_toggle; // gates toggleB / indexi activity (original note: to conserve energy)
    reg         en_started;    // high from start until the read counter reaches 63 (or !en)
    wire        sxregs;        // strobe: latch the shift register into xb*_reg
    wire        sxregs_d8;     // (sxregs && en_started) after the i_sxregs_d8 delay line

    // Data-valid chain.
    wire        pre2_endv;     // start after the i_pre2_endv delay line
    wire        pre2_disdv;    // rd_last after the i_pre2_disdv delay line (AF2015: was missing)
    reg         pre_endv;
    reg         pre_disdv;
    reg         pre_dv;

    // ------------------------------------------------------------------
    // Delay lines (dly_16 is defined elsewhere; the original notes give the
    // effective delay as dly+1 cycles - not verifiable from this file).
    // ------------------------------------------------------------------
    // SRL16 i_endv (.Q(endv), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(start)); // dly=14+1
    // dly_16 #(.WIDTH(1)) i_endv(.clk(clk),.rst(1'b0), .dly(4'd14), .din(start), .dout(endv)); // dly=14+1
    dly_16 #(.WIDTH(1)) i_pre2_endv (
        .clk  (clk),
        .rst  (1'b0),
        .dly  (4'd15),       // dly=15+1
        .din  (start),
        .dout (pre2_endv)
    );

    // SRL16 i_disdv (.Q(disdv), .A0(1'b0), .A1(1'b1), .A2(1'b1), .A3(1'b1), .CLK(clk), .D(rd_cntrs[5:0]==6'h3f)); // dly=14+1
    // dly_16 #(.WIDTH(1)) i_disdv(.clk(clk),.rst(1'b0), .dly(4'd14), .din(rd_cntrs[5:0]==6'h3f), .dout(disdv)); // dly=14+1
    dly_16 #(.WIDTH(1)) i_pre2_disdv (
        .clk  (clk),
        .rst  (1'b0),
        .dly  (4'd15),       // dly=15+1
        .din  (rd_last),
        .dout (pre2_disdv)
    );

    // SRL16 i_sxregs (.Q(sxregs), .A0(1'b0), .A1(1'b0), .A2(1'b0), .A3(1'b1), .CLK(clk),.D((rd_cntr[5:3]==3'h0) && en_started)); // dly=8+1
    // dly_16 #(.WIDTH(1)) i_sxregs(.clk(clk),.rst(1'b0), .dly(4'd8), .din((rd_cntr[5:3]==3'h0) && en_started), .dout(sxregs)); // dly=8+1
    dly_16 #(.WIDTH(1)) i_sxregs (
        .clk  (clk),
        .rst  (1'b0),
        .dly  (4'd9),        // dly=9+1
        .din  ((rd_cntrs[2:0] == 3'h0) && en_started), // first word of every group of eight
        .dout (sxregs)
    );

    // SRL16 i_sxregs_d8 (.Q(sxregs_d8), .A0(1'b1), .A1(1'b1), .A2(1'b1), .A3(1'b0), .CLK(clk),.D(sxregs && en_started)); // dly=7+1
    dly_16 #(.WIDTH(1)) i_sxregs_d8 (
        .clk  (clk),
        .rst  (1'b0),
        .dly  (4'd7),        // dly=7+1
        .din  (sxregs && en_started),
        .dout (sxregs_d8)
    );

    // ------------------------------------------------------------------
    // Transpose memory read sequencing
    // ------------------------------------------------------------------
    assign ren = en_started;

    // Transposed counter: the two 3-bit fields of the linear counter are swapped.
    assign rd_cntr[6:0] = {rd_page, rd_cntrs[2:0], rd_cntrs[5:3]};

    always @ (posedge clk) begin
        // en_started <= en && (start || en_started);
        // start is tested before the end-of-block condition because they can
        // happen simultaneously (original note)
        if (!en)          en_started <= 0;
        else if (start)   en_started <= 1;
        else if (rd_last) en_started <= 0;

        regen <= en_started;

        if (start) rd_page <= page;

        // counter restarts on start and stops at 63 (original note: saves energy)
        if (start)         rd_cntrs[5:0] <= 6'b0;
        else if (!rd_last) rd_cntrs[5:0] <= rd_cntrs[5:0] + 1;
    end

    // ------------------------------------------------------------------
    // Data-valid chain
    // ------------------------------------------------------------------
    always @ (posedge clk) begin
        pre_endv  <= pre2_endv;
        endv      <= pre_endv;   // output reg
        pre_disdv <= pre2_disdv;
        // set by pre_endv, cleared by pre_disdv or !en
        pre_dv    <= en && (pre_endv || (pre_dv && ~pre_disdv));
        // dv <= en && (endv || (dv && ~disdv));
        dv        <= en && pre_dv; // output reg
    end

    // ------------------------------------------------------------------
    // Row sequencing: add/sub toggle and coefficient index
    // ------------------------------------------------------------------
    always @ (posedge clk) begin
        // set by sxregs, kept until sxregs_d8 arrives (or en goes low)
        enable_toggle <= en && (sxregs || (enable_toggle && !sxregs_d8));

        // forced to 1 after sxregs, then alternates while enabled
        toggleB <= sxregs || (enable_toggle && (~toggleB));

        // preset to 7 by sxregs, then counts up (wrapping 7 -> 0) while enabled
        if (sxregs)             indexi <= 3'h7;
        else if (enable_toggle) indexi <= indexi + 1;
    end

    // ------------------------------------------------------------------
    // 1D-DCT coefficient table for the four multipliers: {sign, magnitude}
    // (same contents as the table in stage 1)
    // ------------------------------------------------------------------
    always @ (posedge clk) begin
        case (indexi)
            0: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, C4}, {1'b0, C4}, {1'b0, C4}, {1'b0, C4}};
            1: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, S7}, {1'b0, C3}, {1'b0, S3}, {1'b0, C7}};
            2: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, S6}, {1'b0, C6}, {1'b1, C6}, {1'b1, S6}};
            3: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, C3}, {1'b1, C7}, {1'b1, S7}, {1'b1, S3}};
            4: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, C4}, {1'b1, C4}, {1'b1, C4}, {1'b0, C4}};
            5: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, S3}, {1'b1, S7}, {1'b0, C7}, {1'b0, C3}};
            6: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, C6}, {1'b1, S6}, {1'b0, S6}, {1'b1, C6}};
            7: {memory1a, memory2a, memory3a, memory4a} <= {{1'b0, C7}, {1'b1, S3}, {1'b0, C3}, {1'b1, S7}};
        endcase
    end

    // ------------------------------------------------------------------
    // Input shift register: tdin enters at xb0_in and moves towards xb7_in
    // ------------------------------------------------------------------
    always @ (posedge clk)
        {xb7_in, xb6_in, xb5_in, xb4_in, xb3_in, xb2_in, xb1_in, xb0_in} <=
            {xb6_in, xb5_in, xb4_in, xb3_in, xb2_in, xb1_in, xb0_in, tdin};

    // Snapshot of the shift register, taken when sxregs is high.
    always @ (posedge clk) if (sxregs)
        {xb7_reg, xb6_reg, xb5_reg, xb4_reg, xb3_reg, xb2_reg, xb1_reg, xb0_reg} <=
            {xb7_in, xb6_in, xb5_in, xb4_in, xb3_in, xb2_in, xb1_in, xb0_in};

    // ------------------------------------------------------------------
    // Adder / subtractor block on sign-extended (17-bit) words
    // ------------------------------------------------------------------
    wire [16:0] xb0_ext = {xb0_reg[15], xb0_reg};
    wire [16:0] xb1_ext = {xb1_reg[15], xb1_reg};
    wire [16:0] xb2_ext = {xb2_reg[15], xb2_reg};
    wire [16:0] xb3_ext = {xb3_reg[15], xb3_reg};
    wire [16:0] xb4_ext = {xb4_reg[15], xb4_reg};
    wire [16:0] xb5_ext = {xb5_reg[15], xb5_reg};
    wire [16:0] xb6_ext = {xb6_reg[15], xb6_reg};
    wire [16:0] xb7_ext = {xb7_reg[15], xb7_reg};

    always @ (posedge clk) begin
        if (toggleB) begin // sums of the mirrored word pairs
            add_sub1b <= xb7_ext + xb0_ext;
            add_sub2b <= xb6_ext + xb1_ext;
            add_sub3b <= xb5_ext + xb2_ext;
            add_sub4b <= xb4_ext + xb3_ext;
        end else begin     // differences of the mirrored word pairs
            add_sub1b <= xb7_ext - xb0_ext;
            add_sub2b <= xb6_ext - xb1_ext;
            add_sub3b <= xb5_ext - xb2_ext;
            add_sub4b <= xb4_ext - xb3_ext;
        end
    end

    // Magnitude of each sum/difference. Only bits [15:0] are registered below
    // (original note: the MSB is always 0).
    // NOTE(review): a sum of exactly -65536 has magnitude 65536, which needs
    // bit 16 - confirm stage 1 output range excludes it.
    wire [16:0] addsub1b_comp_w = add_sub1b[16] ? (-add_sub1b) : add_sub1b;
    wire [16:0] addsub2b_comp_w = add_sub2b[16] ? (-add_sub2b) : add_sub2b;
    wire [16:0] addsub3b_comp_w = add_sub3b[16] ? (-add_sub3b) : add_sub3b;
    wire [16:0] addsub4b_comp_w = add_sub4b[16] ? (-add_sub4b) : add_sub4b;

    always @ (posedge clk) begin
        {save_sign1b, addsub1b_comp} <= {add_sub1b[16], addsub1b_comp_w[15:0]};
        {save_sign2b, addsub2b_comp} <= {add_sub2b[16], addsub2b_comp_w[15:0]};
        {save_sign3b, addsub3b_comp} <= {add_sub3b[16], addsub3b_comp_w[15:0]};
        {save_sign4b, addsub4b_comp} <= {add_sub4b[16], addsub4b_comp_w[15:0]};
    end

    // ------------------------------------------------------------------
    // Sign-magnitude multiplication: unsigned product, sign handled apart
    // ------------------------------------------------------------------
    assign p1b_all = addsub1b_comp * memory1a[15:0]; // MSB [16] is sign!
    assign p2b_all = addsub2b_comp * memory2a[15:0];
    assign p3b_all = addsub3b_comp * memory3a[15:0];
    assign p4b_all = addsub4b_comp * memory4a[15:0];

    // Register product bits [32:14] together with the product sign
    // (operand sign XOR coefficient sign).
    always @ (posedge clk) begin
        p1b_all_r <= p1b_all[32:14];
        p1b_sig   <= (save_sign1b ^ memory1a[16]);

        p2b_all_r <= p2b_all[32:14];
        p2b_sig   <= (save_sign2b ^ memory2a[16]);

        p3b_all_r <= p3b_all[32:14];
        p3b_sig   <= (save_sign3b ^ memory3a[16]);

        p4b_all_r <= p4b_all[32:14];
        p4b_sig   <= (save_sign4b ^ memory4a[16]);
    end

    // Apply the sign: back to two's complement.
    always @ (posedge clk) begin
        p1b[18:0] <= p1b_sig ? (-p1b_all_r) : (p1b_all_r);
        p2b[18:0] <= p2b_sig ? (-p2b_all_r) : (p2b_all_r);
        p3b[18:0] <= p3b_sig ? (-p3b_all_r) : (p3b_all_r);
        p4b[18:0] <= p4b_sig ? (-p4b_all_r) : (p4b_all_r);
    end

    // ------------------------------------------------------------------
    // Final adder: two-level registered, sign-extended sum of the four
    // products, followed by the rounded output register
    // ------------------------------------------------------------------
    // Rounding: bits [20:8] are kept, bit 7 is added as the rounding term.
    assign dct_2d_rnd[12:0] = dct_2d_int[20:8];

    always @ (posedge clk) begin
        dct2d_int1 <= ({p1b[18], p1b[18:0]} + {p2b[18], p2b[18:0]});
        dct2d_int2 <= ({p3b[18], p3b[18:0]} + {p4b[18], p4b[18:0]});
        dct_2d_int <= ({dct2d_int1[19], dct2d_int1[19:0]} + {dct2d_int2[19], dct2d_int2[19:0]});

        // output register only updates while pre_dv is high
        if (pre_dv) dct2_out[12:0] <= dct_2d_rnd[12:0] + dct_2d_int[7];
    end
    // assign dct2_out[12:0] = dct_2d_rnd[12:0] + dct_2d_int[7];

endmodule
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment