From 59d1a3915add87802f2d88b81640a8caf6564ff7 Mon Sep 17 00:00:00 2001 From: ShuranXuMCHP Date: Wed, 26 Mar 2025 15:29:08 -0400 Subject: [PATCH 1/2] Updated Libero IP versions based on 2025.1 release and verified the updates by running the updated libero_flow.tcl on latest Libero capture. --- Training1/Libero/libero_flow.tcl | 8 ++++---- Training1/Libero/run_shls_on_examples.ps1 | 10 +++++----- Training1/Libero/run_shls_on_examples.sh | 10 +++++----- Training2/Libero/libero_flow.tcl | 6 +++--- Training2/Libero/run_shls_on_examples.sh | 6 +++--- Training3/Libero/libero_flow.tcl | 4 ++-- Training3/Libero/run_shls_on_examples.sh | 4 ++-- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Training1/Libero/libero_flow.tcl b/Training1/Libero/libero_flow.tcl index bdf77899..fc5a7beb 100644 --- a/Training1/Libero/libero_flow.tcl +++ b/Training1/Libero/libero_flow.tcl @@ -25,12 +25,12 @@ set Display_Controller_version 3.1.2 set HDMI_RX_version 4.2.0 set HDMI_TX_version 1.0.2 set PF_TX_PLL_version 2.0.304 -set PF_XCVR_ERM_version 3.1.205 +set PF_XCVR_ERM_version 3.1.206 set PF_XCVR_REF_CLK_version 1.0.103 set CORERESET_PF_version 2.2.107 set CORERXIODBITALIGN_version 2.1.104 -set PF_IOD_GENERIC_RX_version 2.1.110 -set PF_DDR4_version 2.5.111 +set PF_IOD_GENERIC_RX_version 2.1.113 +set PF_DDR4_version 2.5.113 set PF_SRAM_AHBL_AXI_version 1.2.111 set mipicsi2rxdecoderPF_version 2.2.5 set COREAHBTOAPB3_version 3.1.100 @@ -39,7 +39,7 @@ set CoreAPB3_version 4.1.100 set CoreGPIO_version 3.2.102 set COREJTAGDEBUG_version 3.1.100 set CoreAHBLite_version 5.4.102 -set PF_INIT_MONITOR_version 2.0.307 +set PF_INIT_MONITOR_version 2.0.308 set MIV_RV32IMA_L1_AHB_version 2.3.100 set COREUART_version 5.6.102 set Bayer_Interpolation_version 3.0.2 diff --git a/Training1/Libero/run_shls_on_examples.ps1 b/Training1/Libero/run_shls_on_examples.ps1 index 29c76c31..a88dab88 100644 --- a/Training1/Libero/run_shls_on_examples.ps1 +++ b/Training1/Libero/run_shls_on_examples.ps1 @@ -5,23 
+5,23 @@ Write-Host "Running shls hw on Canny_FIFO_Switch" cd ../Canny_FIFO_Switch -shls hw +shls -a hw Write-Host "Running shls hw on Gaussian_FIFO_Pipelined" cd ../Gaussian_FIFO_Pipelined -shls hw +shls -a hw Write-Host "Running shls hw on RGB2YCbCr" cd ../RGB2YCbCr -shls hw +shls -a hw Write-Host "Running shls hw on YCbCr2RGB" cd ../YCbCr2RGB -shls hw +shls -a hw Write-Host "Running shls hw on alpha_blend" cd ../alpha_blend -shls hw +shls -a hw cd ../Libero/ diff --git a/Training1/Libero/run_shls_on_examples.sh b/Training1/Libero/run_shls_on_examples.sh index d5fa5a64..fd0cc703 100644 --- a/Training1/Libero/run_shls_on_examples.sh +++ b/Training1/Libero/run_shls_on_examples.sh @@ -6,31 +6,31 @@ echo "" echo "Running shls hw on Canny_FIFO_Switch" echo "" cd ../Canny_FIFO_Switch -shls hw +shls -a hw echo "" echo "Running shls hw on Gaussian_FIFO_Pipelined" echo "" cd ../Gaussian_FIFO_Pipelined -shls hw +shls -a hw echo "" echo "Running shls hw on RGB2YCbCr" echo "" cd ../RGB2YCbCr -shls hw +shls -a hw echo "" echo "Running shls hw on YCbCr2RGB" echo "" cd ../YCbCr2RGB -shls hw +shls -a hw echo "" echo "Running shls hw on alpha_blend" echo "" cd ../alpha_blend -shls hw +shls -a hw cd ../Libero/ diff --git a/Training2/Libero/libero_flow.tcl b/Training2/Libero/libero_flow.tcl index e3a3d92f..bb311ea9 100644 --- a/Training2/Libero/libero_flow.tcl +++ b/Training2/Libero/libero_flow.tcl @@ -24,8 +24,8 @@ set PF_CCC_version 2.2.220 set Display_Controller_version 3.1.2 set CORERESET_PF_version 2.2.107 set CORERXIODBITALIGN_version 2.1.104 -set PF_IOD_GENERIC_RX_version 2.1.110 -set PF_DDR4_version 2.5.111 +set PF_IOD_GENERIC_RX_version 2.1.113 +set PF_DDR4_version 2.5.113 set PF_SRAM_AHBL_AXI_version 1.2.111 set mipicsi2rxdecoderPF_version 2.2.5 set COREAHBTOAPB3_version 3.1.100 @@ -34,7 +34,7 @@ set CoreAPB3_version 4.1.100 set CoreGPIO_version 3.2.102 set COREJTAGDEBUG_version 3.1.100 set CoreAHBLite_version 5.4.102 -set PF_INIT_MONITOR_version 2.0.307 +set 
PF_INIT_MONITOR_version 2.0.308 set MIV_RV32IMA_L1_AHB_version 2.3.100 set COREUART_version 5.6.102 set Bayer_Interpolation_version 3.0.2 diff --git a/Training2/Libero/run_shls_on_examples.sh b/Training2/Libero/run_shls_on_examples.sh index 18893233..e939a39a 100644 --- a/Training2/Libero/run_shls_on_examples.sh +++ b/Training2/Libero/run_shls_on_examples.sh @@ -6,19 +6,19 @@ echo "" echo "Running shls hw on digit_recognition" echo "" cd ../digit_recognition -shls hw +shls -a hw echo "" echo "Running shls hw on RGB2YCbCr" echo "" cd ../../Training1/RGB2YCbCr -shls hw +shls -a hw echo "" echo "Running shls hw on alpha_blend" echo "" cd ../alpha_blend -shls hw +shls -a hw cd ../../Training2/Libero/ diff --git a/Training3/Libero/libero_flow.tcl b/Training3/Libero/libero_flow.tcl index 3ad20411..b9a441cc 100644 --- a/Training3/Libero/libero_flow.tcl +++ b/Training3/Libero/libero_flow.tcl @@ -23,14 +23,14 @@ new_project -location {./Libero_training3} -name {Libero_training3} -project_des #IP core version variables set PF_CCC_version 2.2.220 set CORERESET_PF_version 2.3.100 -set PF_DDR4_version 2.5.111 +set PF_DDR4_version 2.5.113 set PF_SRAM_AHBL_AXI_version 1.2.111 set COREFIFO_version 2.7.105 set COREI2C_version 7.2.101 set CoreAPB3_version 4.1.100 set CoreGPIO_version 3.2.102 set COREJTAGDEBUG_version 3.1.100 -set PF_INIT_MONITOR_version 2.0.307 +set PF_INIT_MONITOR_version 2.0.308 set MIV_RV32_version 3.0.100 set CoreUARTapb_version 5.6.102 set COREAXI4INTERCONNECT_version 2.8.103 diff --git a/Training3/Libero/run_shls_on_examples.sh b/Training3/Libero/run_shls_on_examples.sh index a321b10c..3915cc20 100644 --- a/Training3/Libero/run_shls_on_examples.sh +++ b/Training3/Libero/run_shls_on_examples.sh @@ -6,13 +6,13 @@ echo "" echo "Running shls hw on wide_mult_axi" echo "" cd ../wide_mult_axi -shls hw +shls -a hw echo "" echo "Running shls hw on texture_mapper" echo "" cd ../texture_mapper -shls hw +shls -a hw cd ../Libero/ From 
942b1eea6b0c0699b382b55968b73717d0b3df40 Mon Sep 17 00:00:00 2001 From: ShuranXuMCHP Date: Wed, 26 Mar 2025 17:05:07 -0400 Subject: [PATCH 2/2] Updated links of documents for the prerequisite section. --- Training1/readme.md | 777 ++++++++++++++++++++++++++------------------ 1 file changed, 465 insertions(+), 312 deletions(-) diff --git a/Training1/readme.md b/Training1/readme.md index 5ffc9331..e13fbc95 100644 --- a/Training1/readme.md +++ b/Training1/readme.md @@ -1,4 +1,3 @@ -

SmartHLS™ Training Session 1:

Image Processing on the PolarFire® Video Kit

@@ -56,11 +55,11 @@ Updated document for outdated figures and for SmartHLS™ 2024.2 release. Before beginning this training, you should install the following software: - - Libero® SoC 2024.2 (or later) with QuestaSim Pro - - [Download](https://www.microchip.com/en-us/products/fpgas-and-plds/fpga-and-soc-design-tools/fpga/libero-software-later-versions) - - SmartHLS 2024.2 (or later): this is packaged with Libero - - DG0849 Video Control GUI used by the PolarFire board demo - - [Download Link](https://www.microchip.com/content/dam/mchp/documents/FPGA/ProductDocuments/socdesignfiles/mpf_dg0849_liberosoc_gui.zip) +- Libero® SoC 2024.2 (or later) with QuestaSim Pro + - [Download](https://www.microchip.com/en-us/products/fpgas-and-plds/fpga-and-soc-design-tools/fpga/libero-software-later-versions) +- SmartHLS 2024.2 (or later): this is packaged with Libero +- DG0849 Video Control GUI used by the PolarFire board demo + - [Download Link](https://www.microchip.com/content/dam/mchp/documents/FPGA/ProductDocuments/socdesignfiles/mpf_dg0849_liberosoc_gui.zip) This document uses the Windows versions of Libero® SoC 2024.2 and SmartHLS 2024.2. Depending on the version you use, the results generated @@ -69,22 +68,23 @@ presented in this document. You should download the training design files in advance: - - Github link to all SmartHLS trainings and examples: - - - ZIP file: - - - We’ll use the Training1 folder for this training. This folder contains the source files required for this training. - - Download `precompiled-binaries.tar.gz` from the Release Assets. This archive contains the pre-compiled bitstream required for this training. 
- - Alternatively, you can re-generate the bitstream and Libero project from scratch by following the instructions - here: +- GitHub link to all SmartHLS trainings and examples: + [https://github.com/MicrochipTech/fpga-hls-examples](https://github.com/MicrochipTech/fpga-hls-examples) + - ZIP file: + [https://github.com/MicrochipTech/fpga-hls-examples/archive/refs/heads/main.zip](https://github.com/MicrochipTech/fpga-hls-examples/archive/refs/heads/main.zip) + - We’ll use the Training1 folder for this training. This folder contains the source files required for this training. +- Download `precompiled-binaries.tar.gz` from the Release Assets. This archive contains the pre-compiled bitstream required for this training. +- Alternatively, you can re-generate the bitstream and Libero project from scratch by following the instructions + here: [https://github.com/MicrochipTech/fpga-hls-examples/tree/main/Training1/Libero](https://github.com/MicrochipTech/fpga-hls-examples/tree/main/Training1/Libero) The following hardware is required: - - PolarFire FPGA Video and Imaging Kit([MPF300-VIDEO-KIT](https://www.microsemi.com/existing-parts/parts/150747)). - - Monitor with an HDMI input. -Make sure the following demo is working on your board: [DG0849: PolarFire FPGA Dual Camera Video Kit Demo Guide](https://www.microsemi.com/document-portal/doc_download/1244036-dg0849-polarfire-fpga-4k-dual-camera-video-kit-demo-guide). +- PolarFire FPGA Video and Imaging Kit ([MPF300-VIDEO-KIT](https://www.microsemi.com/existing-parts/parts/150747)). +- Monitor with an HDMI input. + +Make sure the following demo is working on your board: [DG0849: PolarFire FPGA Dual Camera Video Kit Demo Guide](https://ww1.microchip.com/downloads/aemDocuments/documents/FPGA/ApplicationNotes/ApplicationNotes/polarfire_4k_dual_camera_video_kit_dg0849_v5.pdf).
-We assume you have already completed the [SmartHLS Tutorial: Sobel Filtering for Image Edge Detection](https://github.com/MicrochipTech/fpga-hls-examples/blob/main/sobel_tutorial/Sobel_Tutorial_Microsemi.pdf). +We assume you have already completed the [SmartHLS Tutorial: Sobel Filtering for Image Edge Detection](../sobel_tutorial/trainingdoc.md). We assume some knowledge of the C/C++ programming language for this training. @@ -94,8 +94,7 @@ to follow along. ## Generating the Libero Project -Before starting the training, we need to first generate the Libero project. - +Before starting the training, we need to first generate the Libero project. If you are using Windows, open the Windows command prompt (cmd) and navigate to the Libero directory, e.g.: @@ -104,6 +103,7 @@ cd C:\Workspace\fpga-hls-examples-main\Training1\Libero ``` and run the following script to generate the HLS example designs: + ```bat run_shls_on_examples.bat ``` @@ -113,8 +113,10 @@ If you are using Linux, open a terminal and navigate to the Libero directory, e. ```bash cd Workspace/fpga-hls-examples-main/Training1/Libero ``` + and run the following script to generate the HLS example designs: -``` bash + +```bash bash run_shls_on_examples.sh ``` @@ -126,49 +128,50 @@ Choose libero_flow.tcl under "Script file". In Arguments, put `GENERATE_ONLY:1`. Click 'Run'. This should take about 10 minutes. - # Overview **Time Required:** 3 hours **Goals of this Training:** - - Deeper dive into commonly used features of SmartHLS - - Demonstrate a SmartHLS design running on the PolarFire® board + +- Deeper dive into commonly used features of SmartHLS +- Demonstrate a SmartHLS design running on the PolarFire® board **Training Topics:** - - Overview of the SmartHLS tool and design flow - - What hardware blocks to design in C++ with SmartHLS vs. RTL? 
- - Overview of the PolarFire board and video kit demo - - Walkthrough of image processing hardware blocks designed in C++ with - SmartHLS - - Alpha Blending - - Color Space Conversion: RGB2YCbCr & YCbCr2RGB - - Gaussian blur - - Canny edge detection - - Deeper dive into SmartHLS: - - Overview of HLS pipelining - - What is the initiation interval? - - What impacts the initiation interval? - - Verification and Testing: - - Writing a C++ testbench - - How does co-simulation work? - - Showing QuestaSim waveforms during co-simulation - - External top-level hardware interface - - AXI-Stream interface (data/valid/ready) - - Input wires (from switches) - - RAM interface - - Deeper dive into HLS optimizations: - - Function pipelining, loop pipelining, FIFOs for streaming - - Canny has 4 filters streamed together using data flow - - SmartHLS C++ Library and Data Types: - - Arbitrary precision integers (ap\_int/ap\_uint) - - Fixed-point data types (ap\_fixpt/ap\_ufixpt) - - FIFO - - LineBuffer - - Export hardware blocks from SmartHLS as SmartDesign IP component - - Integration of SmartHLS SmartDesign IP component into PolarFire - Design - - Running SmartHLS hardware on the PolarFire board + +- Overview of the SmartHLS tool and design flow +- What hardware blocks to design in C++ with SmartHLS vs. RTL? +- Overview of the PolarFire board and video kit demo +- Walkthrough of image processing hardware blocks designed in C++ with + SmartHLS + - Alpha Blending + - Color Space Conversion: RGB2YCbCr & YCbCr2RGB + - Gaussian blur + - Canny edge detection +- Deeper dive into SmartHLS: + - Overview of HLS pipelining + - What is the initiation interval? + - What impacts the initiation interval? + - Verification and Testing: + - Writing a C++ testbench + - How does co-simulation work? 
+ - Showing QuestaSim waveforms during co-simulation + - External top-level hardware interface + - AXI-Stream interface (data/valid/ready) + - Input wires (from switches) + - RAM interface +- Deeper dive into HLS optimizations: + - Function pipelining, loop pipelining, FIFOs for streaming + - Canny has 4 filters streamed together using data flow +- SmartHLS C++ Library and Data Types: + - Arbitrary precision integers (ap\_int/ap\_uint) + - Fixed-point data types (ap\_fixpt/ap\_ufixpt) + - FIFO + - LineBuffer +- Export hardware blocks from SmartHLS as SmartDesign IP component + - Integration of SmartHLS SmartDesign IP component into PolarFire + Design + - Running SmartHLS hardware on the PolarFire board # SmartHLS High-Level Synthesis Overview @@ -204,45 +207,6 @@ Hover over each icon in SmartHLS to find out their meanings.

Figure 3: SHLS design flow steps

-Starting from the left of Figure 2, the icons are: - -1) **Add Files to Project** - -Then icons for the software development flow: - -2) **Compile Software with GCC** - -3) **Run Software that was compiled** - -4) **Debug Software with gdb** - -5) **Profile Software with gprof** - -The hardware development flow icons are: - -6) **Compile Software to Hardware (Software to HDL)** - -7) **Simulate Hardware in QuestaSim with custom testbench** - -8) **Software/Hardware Co-simulation** - -9) **Synthesize Hardware to FPGA (HDL to hardware layout) – RTL Synthesis only for resource results** - -10) **Synthesize Hardware to FPGA – RTL Synthesis, place and route for timing and resource results** - -11) **Compile Software to Processor/Accelerator SoC** - -With the last three icons, you can: - -12) **Set HLS Constraints** - -13) **Launch Schedule Viewer** - -14) **Clean SmartHLS Project** - - - - These SmartHLS commands can also be run from the *SmartHLS* top bar menu. Figure 3 summarizes the SmartHLS design flow steps. We create the SmartHLS project and follow a standard software development flow using @@ -278,7 +242,7 @@ C++ reference implementation and manually convert to RTL, HLS will save you a lot of time. If your design is mainly control path and shuffling a few bits around, then use RTL. -| **Good fit for SmartHLS** | **Bad fit for HLS (use RTL instead)** | +| **Good fit for SmartHLS** | **Bad fit for HLS (use RTL instead)** | | ------------------------------------------------------------------------ | -------------------------------------------------------------------------- | | Image processing filters (edge detect, blur, noise cancellation) | Bus controller. Reason: needs precise cycle-accurate behavior | | DSP application (Viterbi Decoder) | FFT. Reason: well-known optimized hardware butterfly structure | @@ -316,70 +280,71 @@ SmartHLS generated IP components on to the PolarFire board by following the steps below: 1. 
Connect the USB cable from J12 on the PolarFire® board to your PC. - 2. Connect the camera board at J5 and remove the lens caps. - 3. Connect the HDMI cable from the PolarFire Video Kit (J2) to your external Monitor. - 4. Refer to [DG0849](https://www.microsemi.com/document-portal/doc_download/1244036-dg0849-polarfire-fpga-4k-dual-camera-video-kit-demo-guide) for jumper settings. We use the default jumper settings shipped with the board. - 5. Make sure all the DIP switches (SW6) are in the ON position. - 6. Connect the AC adapter to the board and power it on (SW4). - 7. Open up FlashPro Express (FPExpress v2024.2), which you can find in the Start Menu, listed under “Microchip Libero SoC v2024.2”: +

8. Select Project and New Job Project. +

9. Now select the job file “`Training1/VIDEO_KIT_TOP.job`” from the release assets folder you downloaded in step 1. - 10. Enter a project location. Click OK. - 11. Now the Programmer window will open. If you do not see the Programmer for the MPF300TS PolarFire® FPGA, then click Refresh/Rescan Programmers. +

12. Now click the RUN button to program the FPGA. - 13. After programming you should see the RUN PASSED. Now power cycle the board and close FlashPro Express. +

14. Now you should see two video streams on your monitor, one in the background and then a smaller one moving around in the foreground. If the video streams look blurry, try focusing the camera by rotating the camera lens. ->For example, if you hold the quick start card that comes with the PolarFire® board up to the camera: +> For example, if you hold the quick start card that comes with the PolarFire® board up to the camera: +

->Then you should see the following output: +> Then you should see the following output: +

15. Launch the “Video Control GUI” from the Windows Start Menu (see prerequisites section if you do not have this program installed): +

16. In the top right there is a dropdown to specify the COM port. Select the COM port (if there are multiple then choose the second highest numbered port): +

17. Now click the Red image beside the dropdown to connect to the FPGA. +

18. The image should turn green to indicate the GUI is now connected to the FPGA and the smaller video feed should become fixed to the top left corner. +

19. You can use the “Alpha” slider to test the SmartHLS generated alpha blend core. Changing the alpha affects the transparency of the smaller video feed. - 20. Now select the “Edge” checkbox to enable the SmartHLS edge detection filters. The main video feed should turn to grayscale, which has a purple tint due to the default Color Balance settings. +

21. Click the push button (SW2) to toggle between 3 modes. The current mode will be displayed on the user defined LED2-4. LED1 should be flashing and shows that the Mi-V is communicating with the FPGA fabric. > LED1 flashing: Mi-V is communicating with the FPGA. -> +> > LED2 on: Grayscale image. -> +> > LED3 on: Gaussian blur. Note: blurring effect is very subtle and only > noticeable for sharp edges and details. -> +> > LED4 on: Canny edge detection. 22. You can turn on/off each of the 4 filters in the Canny edge @@ -390,14 +355,11 @@ the steps below: switch also turns on and off the Gaussian blur filter (LED3 on). Tip: use a pen to flip the switches, you may need to break the tape covering them first. - - 1. Gaussian blur - - 2. Sobel filter - - 3. Non-maximum suppression - - 4. Hysteresis + + 1. Gaussian blur + 2. Sobel filter + 3. Non-maximum suppression + 4. Hysteresis > When you hold the same quick start card up to the camera, you should see the Canny Edge detection running on the monitor: @@ -478,46 +440,45 @@ design implementation will be covered in this training session. projects used in this training into our SmartHLS workspace. Follow the directions below. -1. Download the design files from github if you have not already (see - Prerequisites). We will use - the Training1 folder of the extracted content for this training. - -2. Open SmartHLS 2024.2 and choose a workspace. +1. Download the design files from github if you have not already (see + Prerequisites). We will use + the Training1 folder of the extracted content for this training. +2. Open SmartHLS 2024.2 and choose a workspace.

->You may want to select a new folder so you can have a blank workspace ->for this training. - ->**Warning:** Make sure there are no spaces in your workspace path. ->Otherwise, there will be an error when running synthesis (either one ->of ![](.//media/image28.png)) from SmartHLS. +> You may want to select a new folder so you can have a blank workspace +> for this training. + +> **Warning:** Make sure there are no spaces in your workspace path. +> Otherwise, there will be an error when running synthesis (either one +> of ![](.//media/image28.png)) from SmartHLS. -3. Select File -\> Import... +3. Select File -\> Import...


-4. In the Import window, select General-\>Existing Projects into - Workspace and then click Next. +4. In the Import window, select General-\>Existing Projects into + Workspace and then click Next.


-5. In the next step, check off “Copy projects into workspace” and then - select “Select root directory” and then click Browse... - - In the popup window browse to the Training1 directory and click OK. +5. In the next step, check off “Copy projects into workspace” and then + select “Select root directory” and then click Browse... + + In the popup window browse to the Training1 directory and click OK.


-6. Now in the Projects box you should see that all 9 SmartHLS projects - have been selected. Note: SmartHLS knows where the projects are by - looking for Eclipse “.project” files in the subdirectories. Click - Finish to import. +6. Now in the Projects box you should see that all 9 SmartHLS projects + have been selected. Note: SmartHLS knows where the projects are by + looking for Eclipse “.project” files in the subdirectories. Click + Finish to import.


-7. After importing you should see all 9 projects in the Project - Explorer on the left. +7. After importing you should see all 9 projects in the Project + Explorer on the left.


@@ -550,9 +511,17 @@ images is given by an alpha input coefficient. Given an input pixel with a red, green, blue (RGB) value, then the alpha blended output for each color (RGB) is given by the equation below: -$$R_{out} = R_{channel1} \times \left( 1 - alpha \right) + \ R_{channel2} \times alpha$$ -$$G_{out} = G_{channel1} \times \left( 1 - alpha \right) + \ G_{channel2} \times alpha$$ -$$B_{out} = B_{channel1} \times \left( 1 - alpha \right) + \ B_{channel2} \times alpha$$ +$$ +R_{out} = R_{channel1} \times \left( 1 - alpha \right) + \ R_{channel2} \times alpha +$$ + +$$ +G_{out} = G_{channel1} \times \left( 1 - alpha \right) + \ G_{channel2} \times alpha +$$ + +$$ +B_{out} = B_{channel1} \times \left( 1 - alpha \right) + \ B_{channel2} \times alpha +$$ In the equations above, alpha ranges from 0 to 1. But in hardware the alpha input is represented by an 8-bit value that ranges from 0 to 255. @@ -654,32 +623,23 @@ included C++ source files into the equivalent logic in Verilog. Figure 12 shows the output files and directories generated by SmartHLS after compiling to hardware. -1. Directory holding the initialization .mem files for RAMs. - -2. Directory holding reports about the hardware. - - 1. *dot\_graphs* directory holds dot files used by the Schedule - Viewer. - - 2. *hls.log* has the Console output of the last SmartHLS command - executed. - - 3. *pipelining.hls.rpt* has pipeline scheduling information used by - Scheduler Viewer. - - 4. *scheduling.hls.rpt* has scheduling information used by the - Scheduler Viewer. - - 5. *summary.hls.alpha\_blend\_smarthls.rpt* has a summary of the - other reports as well as interface and RAM information. - -3. Generated Verilog design. - -4. Generated VHDL wrapper for Verilog design. - -5. TCL script to import Verilog design into SmartDesign. - -6. QuestaSim script to display module ports in a hierarchy. +1. Directory holding the initialization .mem files for RAMs. +2. Directory holding reports about the hardware. + + 1. 
*dot\_graphs* directory holds dot files used by the Schedule + Viewer. + 2. *hls.log* has the Console output of the last SmartHLS command + executed. + 3. *pipelining.hls.rpt* has pipeline scheduling information used by + the Schedule Viewer. + 4. *scheduling.hls.rpt* has scheduling information used by the + Schedule Viewer. + 5. *summary.hls.alpha\_blend\_smarthls.rpt* has a summary of the + other reports as well as interface and RAM information. +3. Generated Verilog design. +4. Generated VHDL wrapper for Verilog design. +5. TCL script to import Verilog design into SmartDesign. +6. QuestaSim script to display module ports in a hierarchy.


Figure 12: SmartHLS Output Files


@@ -688,6 +648,7 @@ compiling to hardware. automatically (this can also be found under the reports directory in the Project Explorer). We can see the RTL interface of the generated SmartHLS Alpha blending block by scrolling down to Section 1: + ``` ====== 1. RTL Interface ====== @@ -775,6 +736,7 @@ void alpha_blend_smarthls(hls::FIFO &input_fifo, #pragma HLS function top ``` + The top-level C++ function will be compiled by SmartHLS into the top-level Verilog module. You can tell that this is the top-level by the SmartHLS pragma: “`function top`". @@ -784,9 +746,11 @@ the top-level function. We start with the simpler second argument “`output_fifo`” which has the type: + ```c hls::FIFO ``` + The \< \> brackets surround the C++ template argument which defines the data type stored in the FIFO. In this case the FIFO holds `rgb_t` data. You can mouse over the `rgb_t` to display the type definition: @@ -903,6 +867,7 @@ The `ap_uint` syntax `out(R1, R2)` is used to write to a specific range of bits within `out`'s 24 bits. In this case, we are writing 8 bits to the range of bits from 23:16 corresponding to the red pixel. R1 and R2 are defined as (R2=16, R1=23): + ```c // 23:16 red const int R2 = 2*W; @@ -911,7 +876,7 @@ const int R1 = R2 + W-1; Similarly, the `in.channel1(R1, R2)` syntax reads the 8-bit red pixel value (23:16) from the 24-bit channel1 input. - + ## SmartHLS Schedule Viewer ![](.//media/image2.png)Now that we have generated the hardware with @@ -939,15 +904,18 @@ called [LLVM intermediate representation LLVM IR is beneficial. For example, given the 32-bit C++ code: + ```c result = a + b – 5 ``` This C++ code could be represented as instructions in LLVM IR as: + ``` %0 = add i32 %a, %b %result = sub i32 %0, 5 ``` + In LLVM IR, intermediate variables are prefixed with a “%”. Each operation (add/sub) includes the bitwidth “`i32`" indicating 32-bit integer. 
The add operands are `%a + %b` and the result is stored in a @@ -961,6 +929,7 @@ LLVM IR always has a label at the beginning and a branching instruction at the end (`br`, `ret`, etc.) Here the `body.0` basic block performs some operations and then branches unconditionally to another basic block labeled `body.1`. Control flow occurs between basic blocks. + ``` body.0: %0 = add i32 %a, %b @@ -1048,6 +1017,7 @@ Steady State Reached.


The 2 iterations/inputs until steady state corresponds to the Pipeline Depth from the SmartHLS report file `summary.hls.alpha_blend_smarthls.rpt` file we saw previously: + ``` ====== 2. Function and Loop Scheduling Results ====== @@ -1093,6 +1063,7 @@ see this output in the Console: Alpha = 127 PASS! ``` + The “PASS\!” is printed by our `main()` testbench function on line 204 when the output image matches the golden expected output image. You can visually confirm yourself by clicking on the output image file: @@ -1145,22 +1116,26 @@ whether `FAST_COSIM` is defined or not: ```c input_channel1 = read_bmp(INPUT_IMAGE1, &input_channel1_header); ``` + Same with the second input channel, which will read either “`polarfire.bmp`" or “`polarfire_100x56.bmp`”: ```c input_channel2 = read_bmp(INPUT_IMAGE2, &input_channel2_header); ``` + The golden expected output will read either “`golden_output.bmp`” or “`golden_output_100x56.bmp`”: ```c golden_output_image = read_bmp(GOLDEN_OUTPUT, &golden_output_image_header); ``` + In our C++ testbench on line 147, we first perform a sanity check test based on the waveform in the alpha blending SolutionCore documentation ([UG0641](https://www.microsemi.com/document-portal/doc_download/135316-ug0641-alpha-blending-user-guide) page 4) shown in Figure 17. + ```c // test 1: sanity check from alpha blend IP core documentation in.channel1 = ap_uint<24>("0x456712"); @@ -1177,6 +1152,7 @@ from the `output_fifo`. Finally, we validate the output was expected. If there was a mismatch, we print out the value and then return a non-zero value from main so that the co-simulation will `FAIL`. Co-simulation will only pass if the main function returns zero. + ```c // test 1: sanity check from alpha blend IP core documentation in.channel1 = ap_uint<24>("0x456712"); @@ -1191,21 +1167,26 @@ if (out != ap_uint<24>("4C6E57")) { return 1; } ``` + Next, starting from line 160, we run alpha blending on the two input image files. 
We specify the input alpha value of 50%, which is represented by the 8-bit value 127: + ```c in.alpha = (int)(255 * 0.5); ``` + We loop over each pixel (`WIDTH` x `HEIGHT`) of the input images. When reading from a BMP image file, consecutive pixels in the same row of the image are stored next to each other (row-major order). Therefore, the outer loop is over the image `HEIGHT` and the inner loop is over the `WIDTH` of the image: + ```c for (int i = 0; i < HEIGHT; i++) { for (int j = 0; j < WIDTH; j++) { ``` + Note: this loop order does not matter in this example since we do not use the `i` or `j` indexes inside the loop body. At the end of the loop, we increment all the pointers for each of the images to the next pixel in @@ -1215,35 +1196,42 @@ In the loop body, we use the `ap_uint` concatenation operator “`(R, G, B)`” to assign the 24-bit input channels. The red pixel will be the most-significant 8 bits of the 24-bit input channel and the blue pixel will be the least-significant 8 bits. + ```c // concatenation operator in.channel1 = (ap_uint<8>(input_channel1->r), ap_uint<8>(input_channel1->g), ap_uint<8>(input_channel1->b)); ``` + After we write to the `input_fifo` we call the top-level function `alpha_blend_smarthls`, and then we read the output from the `output_fifo`. We extract out the 8-bit RGB values from the 24-bit output: + ```c rgb_t rgb = output_fifo.read(); output_image_ptr->r = rgb(R1, R2); output_image_ptr->g = rgb(G1, G2); output_image_ptr->b = rgb(B1, B2); ``` + Then we verify the output pixel matches the expected pixel. We return 1 from main if there is a mismatch. At the end of the main function we write the alpha blended image to the “`output.bmp`” file: + ```c write_bmp("output.bmp", &input_channel1_header, output_image); ``` + We reuse the same BMP header data (image properties like width and height) as the input channel 1 image. And we print a message and return 0 from the main function to indicate to co-simulation that the testbench passed. 
+ ```c printf("PASS!\n"); return 0; @@ -1265,17 +1253,21 @@ clicking the plus button. The `FAST_COSIM` define will change the input image to be 100x56 bmp files (instead of 1080p images). This change will speed up the co-simulation time considerably (from 20 min to 2 min): + ```c // uncomment this line to test on a smaller image for faster co-simulation #define FAST_COSIM ``` + ![](.//media/image2.png)Since the code changed, we should recompile (![](.//media/image59.png)) and rerun (![](.//media/image60.png)) the software verify that the software still passes on this new input: + ``` Alpha = 127 PASS! ``` + If you open the `output.bmp` image, you will notice the dimensions are now much smaller. @@ -1284,6 +1276,7 @@ SmartHLS (![](.//media/image61.png)) to regenerate the hardware. Now, we start co-simulation (![](.//media/image62.png)) which will take a few minutes to finish. You should verify that the following results appear in the Console: + ``` +--------------------------+-----------------+--------------------------+----------------------------+-----------------------+ | Top-Level Name | Number of calls | Simulation time (cycles) | Call Latency (min/max/avg) | Call II (min/max/avg) | @@ -1296,22 +1289,23 @@ SW/HW co-simulation: PASS 16:45:59 Build Finished (took 1m:4s.590ms) ``` + The “`SW/HW co-simulation: PASS`” indicates that the simulation was successful and the `main()` testbench function returned 0. The SmartHLS co-simulation flow performs the following 3 steps automatically: -1. SmartHLS runs your `main()` testbench function in software. All inputs - to the top-level function are saved in input test vector files. -2. SmartHLS generates an RTL testbench that will read the input test - vector files from step 1. SmartHLS uses QuestaSim to simulate the RTL - testbench and SmartHLS-generated Verilog. The module outputs are - saved into output simulation files. -3. 
SmartHLS reruns your `main()` testbench function in software but - replaces the top-level function calls with the return value from the - output simulation files from step 2. If the hardware outputs are - correct then the `main()` function will still return 0 (`PASS`). +1. SmartHLS runs your `main()` testbench function in software. All inputs + to the top-level function are saved in input test vector files. +2. SmartHLS generates an RTL testbench that will read the input test + vector files from step 1. SmartHLS uses QuestaSim to simulate the RTL + testbench and SmartHLS-generated Verilog. The module outputs are + saved into output simulation files. +3. SmartHLS reruns your `main()` testbench function in software but + replaces the top-level function calls with the return value from the + output simulation files from step 2. If the hardware outputs are + correct then the `main()` function will still return 0 (`PASS`). The co-simulation flow is useful to run as a sanity check that the SmartHLS generated hardware is correct and to report the number of clock @@ -1334,14 +1328,14 @@ We are targeting PolarFire® MPF300TS device. Click OK: The SmartHLS project device setting does a few things: -1) Sets up internal operator delay models for the target family. These - delay models are used by SmartHLS to decide how much pipelining to - add in the circuit to meet the Fmax constraint. -2) Passes the part number to Libero® SoC when running FPGA synthesis, - place, and route (![](.//media/image65.png)) to get resource/Fmax - results. -3) Account for FPGA family-specific issues – for example SmartFusion2 - RAMs do not support power-up initialization. +1) Sets up internal operator delay models for the target family. These + delay models are used by SmartHLS to decide how much pipelining to + add in the circuit to meet the Fmax constraint. 
+2) Passes the part number to Libero® SoC when running FPGA synthesis, + place, and route (![](.//media/image65.png)) to get resource/Fmax + results. +3) Account for FPGA family-specific issues – for example SmartFusion2 + RAMs do not support power-up initialization. ## Design FPGA Implementation: Resources and Timing @@ -1389,6 +1383,7 @@ When the HLS core is integrated into a larger system, the system Fmax may be low * Interface 4LUTs and DFFs are occupied due to the uses of LSRAM, Math, and uSRAM. Number of interface 4LUTs/DFFs = (36 * #.LSRAM) + (36 * #.Math) + (12 * #.uSRAM) = (36 * 0) + (36 * 6) + (12 * 0) = 216. ``` + The demo design we want to integrate this block into has a required clock period of 6.734 ns. This means the synthesized period of the Alpha Blending block must be at most 6.734 ns. @@ -1431,9 +1426,11 @@ Browse.


Now navigate to your `Libero.exe`, for example: + ``` C:\Microchip\Libero_SoC_2024.2\Designer\bin\libero.exe ``` + Click OK.


@@ -1471,15 +1468,14 @@ SolutionCore to only consider Fabric resources in Table 3. Interface LUTs/DFFs)

-| | **SmartHLS Alpha Blend** | **SolutionCore Alpha Blend** | -| ---------------- | ------------------------ | ---------------------------- | -| **Fabric 4LUTs** | 153 | 273 | -| **Fabric DFFs** | 12 | 242 | +| | **SmartHLS Alpha Blend** | **SolutionCore Alpha Blend** | +| ---------------------- | ------------------------------ | ---------------------------------- | +| **Fabric 4LUTs** | 153 | 273 | +| **Fabric DFFs** | 12 | 242 |


- -## SmartHLS Design Complexity vs SolutionCore RTL +## SmartHLS Design Complexity vs SolutionCore RTL We can now compare the complexity of the original alpha blend SolutionCore Verilog design and the SmartHLS C++ design. We have @@ -1514,55 +1510,57 @@ SmartHLS-generated Alpha Blend block and import the IP component into SmartDesign. This will showcase the design flow for integrating SmartHLS generated Verilog Cores into Libero® SoC SmartDesign. -1. Open the `alpha_blend.cpp` source file in the `alpha_blend` project in - the Project Explorer. -


-2. Click the “Compile Software to Hardware” button - ![](.//media/image80.png) on the top toolbar. -3. Launch Libero SoC 2024.2 and open the project: “`Libero/Libero_training1/Libero_training1.prjx`” you generated as part of the [prerequisites](#generating-the-libero-project) . - On Windows, if you see errors about missing files or errors in Synthesis, you will need to move the project to a directory with a short name (such as `C:\Downloads` or `C:\Workspace`) to avoid issues with long filenames. - - Note: The Libero project was created when SmartHLS still had the - name “LegUp”, so you might see some places with the word LegUp. This - will be addressed in a future version of the training. - -4. Navigate to the Design Hierarchy and search for “`alpha_blend`”. - Right click the `alpha_blend_top` design component and select Remove - Core Definition, then right click again and Unlink. We want to avoid - any duplicate blocks when importing the new `alpha_blend_top` HDL+ - block from SmartHLS. +1. Open the `alpha_blend.cpp` source file in the `alpha_blend` project in + the Project Explorer. + +


+2. Click the “Compile Software to Hardware” button + ![](.//media/image80.png) on the top toolbar. +3. Launch Libero SoC 2024.2 and open the project: “`Libero/Libero_training1/Libero_training1.prjx`” you generated as part of the [prerequisites](#generating-the-libero-project) . + On Windows, if you see errors about missing files or errors in Synthesis, you will need to move the project to a directory with a short name (such as `C:\Downloads` or `C:\Workspace`) to avoid issues with long filenames. + + Note: The Libero project was created when SmartHLS still had the + name “LegUp”, so you might see some places with the word LegUp. This + will be addressed in a future version of the training. +4. Navigate to the Design Hierarchy and search for “`alpha_blend`”. + Right click the `alpha_blend_top` design component and select Remove + Core Definition, then right click again and Unlink. We want to avoid + any duplicate blocks when importing the new `alpha_blend_top` HDL+ + block from SmartHLS. +


-5. Without clearing the search, double click the `video_pipelining` - SmartDesign file to open the `video_pipelining` SmartDesign Canvas. +5. Without clearing the search, double click the `video_pipelining` + SmartDesign file to open the `video_pipelining` SmartDesign Canvas. +


-6. Find the `alpha_blend_top` module which should now be red. -


+6. Find the `alpha_blend_top` module which should now be red. -7. On the top toolbar, click Project-\>Execute Script... and run the - `create_hdl_plus.tcl` file in the - `alpha_blend\hls_output\scripts\libero` SmartHLS project directory. - SmartDesign will open a report window when it finishes. Make sure - the script executed successfully and close the report window. -


- The tcl script may replace the component and the module will no longer be red. - If the block is still red, follow step 8. +


-8. Right click the red `alpha_blend_top_0` block and select Replace - Component… to replace the block with the newly imported - `alpha_blend_top`. +7. On the top toolbar, click Project-\>Execute Script... and run the + `create_hdl_plus.tcl` file in the + `alpha_blend\hls_output\scripts\libero` SmartHLS project directory. + SmartDesign will open a report window when it finishes. Make sure + the script executed successfully and close the report window. -


+


+ The tcl script may replace the component and the module will no longer be red. + If the block is still red, follow step 8. +8. Right click the red `alpha_blend_top_0` block and select Replace + Component… to replace the block with the newly imported + `alpha_blend_top`. - If this component is not in the list, you can instantiate it manually from the Design Hierarchy: -


- And then connect it manually to replace the red module. +


-9. Click the “Generate Component” (![](.//media/image88.png)) button in - the SmartDesign toolbar for video\_pipelining and its parent - component VIDEO\_KIT\_TOP. + If this component is not in the list, you can instantiate it manually from the Design Hierarchy: +


+ And then connect it manually to replace the red module. +9. Click the “Generate Component” (![](.//media/image88.png)) button in + the SmartDesign toolbar for video\_pipelining and its parent + component VIDEO\_KIT\_TOP. 10. The alpha\_blend block has now been integrated and the project is ready for synthesis, place, and route. We skip this step for now since this will take 1-2 hours. @@ -1587,9 +1585,10 @@ In the pipeline schedule, the circuit can finish in 5 cycles and starts a new load every clock cycle. On cycle 3, the pipelined circuit is executing a Load, Comp, and Store from three different loop iterations in parallel, fully utilizing the FPGA hardware resources. +


-Figure 18: Comparing sequential versus pipelined hardware operations.

+Figure 18: Comparing sequential versus pipelined hardware operations.

When pipelining, SmartHLS will automatically analyze dependencies and partition operations into pipeline stages to minimize the ***initiation @@ -1600,10 +1599,12 @@ into the pipeline every clock cycle. Loop pipelining can be achieved in SmartHLS with the loop pipeline pragma or the function pipeline pragma: + ``` #pragma HLS loop pipeline #pragma HLS function pipeline ``` + Loop pipelining only applies to a specific loop in a C++ function. Meanwhile, function pipelining is applied to an entire C++ function and SmartHLS will automatically unroll all loops in that function. @@ -1618,6 +1619,7 @@ examples of pipelines where the initiation interval cannot be 1. ![](.//media/image2.png)In the Project Explorer tab, click the project `pipeline_hazards` and open `pipeline_hazards.cpp`. +


![](.//media/image2.png)There are three functions in this file @@ -1626,6 +1628,7 @@ Before we look at the functions, compile the project to hardware ![](.//media/image80.png) to verify that the pipelines generated have II greater than 1. Near the bottom of the Console output, you should find the following: + ``` Info: Generating pipeline for loop on line 10 of pipeline_hazards.cpp with label "for_loop_pipeline_hazards_cpp_10_2". Pipeline initiation interval = 3. @@ -1634,12 +1637,14 @@ Info: Generating pipeline for loop on line 18 of pipeline_hazards.cpp with label Info: Generating pipeline for loop on line 28 of pipeline_hazards.cpp with label "for_loop_pipeline_hazards_cpp_28_2". Pipeline initiation interval = 2. ``` + SmartHLS prints out pipelining information for each loop in the Console. This confirms that the three pipelines in the three examples have II greater than 1. SmartHLS also prints this information to the `summary.hls.main.rpt` file found in the reports directory. +


![](.//media/image2.png)Double click summary.hls.main.rpt to open it and @@ -1647,6 +1652,7 @@ then scroll down to section 2: Function and Loop Scheduling Results. Scroll to the right to see the same loop pipelining information. Notice there is more information here than in the Console output, such as the pipeline length. Now close the file. + ``` ====== 2. Function and Loop Scheduling Results ====== +-----------------------------------------------------------+ @@ -1677,6 +1683,7 @@ multiply, and 1 cycle for the store before starting every loop iteration. Therefore, the pipeline initiation interval is 3 cycles (1 + 1 + 1). A diagram of how the pipeline schedule would look is presented in Figure 19. + ```c void cross_iteration_dependency( volatile int array[N] ) { #pragma HLS loop unroll factor(1) @@ -1686,6 +1693,7 @@ void cross_iteration_dependency( volatile int array[N] ) { } } ``` +


Figure 19: Example of initiation interval of 3 due to cross-iteration dependency.


@@ -1693,6 +1701,7 @@ dependency.


generated from compiling the project to hardware in the previous step. Near the bottom of the Console there is the following output. You might need to scroll up a bit to see it. + ``` Info: Cross-iteration dependency does not allow initiation interval (II) of 1. Dependency (distance = 1) from 'store' operation for array 'array' (at line 11 of pipeline_hazards.cpp) to @@ -1743,6 +1752,7 @@ interval must be 2 due to resource contention on the single multiplier. In the schedule of Figure 20, there is only one multiply operation in any clock cycle (column). A diagram of how the pipeline would look like is presented in Figure 20. + ```c void functional_unit_contention( volatile int array[N] ) { #pragma HLS loop unroll factor(1) @@ -1754,11 +1764,13 @@ void functional_unit_contention( volatile int array[N] ) { } } ``` +


Figure 20: Example of functional unit contention in a loop pipeline


![](.//media/image2.png)In the Console output, find the messages about resource constraints generated for this pipeline. This should be above the messages generated for the pipeline in the previous example. + ``` Info: Resource constraint limits initiation interval to 2. Resource 'signed_multiply_32' has 2 uses per cycle but only 1 units @@ -1772,6 +1784,7 @@ available. | | Total # of Competing Uses | 2 | +-----------------------+---------------------------------+---------------------+ ``` + This table shows the operations that caused resource contention in the pipeline. SmartHLS mentions that there are 2 uses of the functional unit “`signed_multiply_32`" but only one unit available. @@ -1798,6 +1811,7 @@ until two cycles later. The pipeline initiation interval must be 2 due to resource contention on the read/write ports. In the schedule of Figure 21 there is only one iteration performing memory operation in any clock cycle (column). + ```c void memory_contention( volatile int array[N] ) { #pragma HLS loop unroll factor(1) @@ -1807,6 +1821,7 @@ void memory_contention( volatile int array[N] ) { } } ``` +


Figure 21: Example of memory contention in a loop pipeline. Two loads @@ -1820,6 +1835,7 @@ will then dictate the II of the entire pipeline. ![](.//media/image2.png)In the Console output, find the messages about resource constraints generated for this pipeline. This should be above the messages generated for the pipeline in the previous example. + ``` Info: Resource constraint limits initiation interval to 2. Resource '@main_entry_array@_local_memory_port' has 3 uses per @@ -1908,18 +1924,18 @@ interface for the RGB2YCbCr is described in Table 1. Table 4: RGB2YCbCr SolutionCore IP Interface -| **Signal Name** | **Direction** | **Width** | **Description** | -| --------------- | ------------- | --------- | ---------------------- | -| RESETN\_I | Input | 1-bit | Active low async reset | -| SYS\_CLK\_I | Input | 1-bit | System Clock | -| RED\_I | Input | 8-bits | Red input pixel | -| GREEN\_I | Input | 8-bits | Green input pixel | -| BLUE\_I | Input | 8-bits | Blue input pixel | -| DATA\_VALID\_I | Input | 1-bit | Input data valid | -| Y\_OUT\_O | Output | 8-bits | Y luma output | -| Cb\_OUT\_O | Output | 8-bits | Cb chroma output | -| Cr\_OUT\_O | Output | 8-bits | Cr chroma output | -| DATA\_VALID\_O | Output | 1-bit | Output data valid | +| **Signal Name** | **Direction** | **Width** | **Description** | +| --------------------- | ------------------- | --------------- | ---------------------- | +| RESETN\_I | Input | 1-bit | Active low async reset | +| SYS\_CLK\_I | Input | 1-bit | System Clock | +| RED\_I | Input | 8-bits | Red input pixel | +| GREEN\_I | Input | 8-bits | Green input pixel | +| BLUE\_I | Input | 8-bits | Blue input pixel | +| DATA\_VALID\_I | Input | 1-bit | Input data valid | +| Y\_OUT\_O | Output | 8-bits | Y luma output | +| Cb\_OUT\_O | Output | 8-bits | Cb chroma output | +| Cr\_OUT\_O | Output | 8-bits | Cr chroma output | +| DATA\_VALID\_O | Output | 1-bit | Output data valid | The desired RTL interface splits up the input red, green, blue 
values @@ -1927,7 +1943,13 @@ into three separate 8-bit inputs sharing a data valid. In contrast to the Alpha Blend module which combined the RGB 8-bits values into a single 24-bit input. -## RGB2YCbCr Block + +The desired RTL interface splits up the input red, green, blue values +into three separate 8-bit inputs sharing a data valid. In contrast to +the Alpha Blend module which combined the RGB 8-bits values into a +single 24-bit input. + +## RGB2YCbCr Block ![](.//media/image2.png)In the SmartHLS project explorer, double click the “RGB2YCbCr” project and open up the `RGB2YCbCr.cpp` file. @@ -1937,6 +1957,7 @@ the “RGB2YCbCr” project and open up the `RGB2YCbCr.cpp` file. ![](.//media/image2.png)Now run SmartHLS Compile Software to Hardware (click the ![](.//media/image39.png) button) and look at the `summary.hls.RGB2YCbCr_smarthls.rpt` in section 1 for the RTL interface: + ``` ====== 1. RTL Interface ====== @@ -1968,6 +1989,7 @@ the “RGB2YCbCr” project and open up the `RGB2YCbCr.cpp` file. The Verilog top-level module ports that are not listed in the above table are unused. Please tie the unused input ports to 0. ``` + The SmartHLS generated top-level interface matches our desired RTL interface from Table 4. @@ -1975,14 +1997,17 @@ interface from Table 4. the top-level function “`RGB2YCbCr_smarthls`” on line 27 to see the function signature that gets generated into the above interface. This function is also pipelined and has two arguments: + ```c void RGB2YCbCr_smarthls(hls::FIFO &input_fifo, hls::FIFO &output_fifo) { #pragma HLS function top #pragma HLS function pipeline ``` + The `input_fifo` argument is of type `hls::FIFO`. With the `RGB` type is defined above as struct with three 8-bit RGB values: + ```c const int RGB_BITWIDTH = 8; struct RGB { @@ -1991,8 +2016,10 @@ struct RGB { ap_uint B; }; ``` + The `output_fifo` argument is of type `hls::FIFO`. 
With the `YCbCr` type is defined above as struct with three 8-bit YCbCr values: + ```c const int YCBCR_BITWIDTH = 8; struct YCbCr { @@ -2008,11 +2035,13 @@ struct, and all elements will share the same 1-bit valid/ready signals. Now if we look in the body of the top-level function RGB2YCbCr, the line calculating the Y (luma) component corresponds to Equation 1: + ```c ycbcr.Y = fixpt_t(16) + ((fixpt_t( 65.738)*in.R + fixpt_t(129.057)*in.G + fixpt_t(25.064)*in.B) >> 8) + fixpt_t(0.5); ``` + The right shift by 8 corresponds to the divide by 256 in Equation 1. The final addition of 0.5 is for rounding since C/C++ will always round down to the nearest integer. @@ -2021,13 +2050,16 @@ For this computation we are using a 18-bit fixed-point type with 10 integer bits and 8 fractional bits (Q10.8) as defined below using the `ap_fixpt` SmartHLS arbitrary precision fixed-point data type (see [SmartHLS documentation](https://onlinedocs.microchip.com/v2/keyword-lookup?keyword=hls_fixed_point&redirect=true&version=latest)): + ```c typedef ap_fixpt<18, 10> fixpt_t; ``` + ![](.//media/image2.png)Now we will quickly simulate the design in software (![](.//media/image59.png) and ![](.//media/image60.png)) to verify its functionality. You should see the following output in the Console meaning that the software simulation has passed: + ``` Expected: Y=16 Cb=128 Cr=128 Actual: Y=16 Cb=128 Cr=128 @@ -2043,6 +2075,7 @@ PASS 22:43:07 Build Finished (took 29s.345ms) ``` + Using SmartHLS fixed-point data types can improve productivity by avoiding error prone RTL code that requires the designer to manually keep track of the decimal place location after various operations. @@ -2052,6 +2085,7 @@ initialization and the underlying fixed-point representation. 
![](.//media/image2.png)For example, we can print the fixed point representation of `fixpt_t(65.738)` by adding this code in the main function on line 104 after the test case validation loop: + ```c std::cout << fixpt_t(65.738).to_fixpt_string(10) << std::endl; std::cout << "= " << fixpt_t(65.738).to_double() << std::endl; @@ -2061,28 +2095,35 @@ Now recompile (![](.//media/image59.png)) and rerun (![](.//media/image60.png)) the software. The Console will print out the fixed-point underlying 18-bit decimal value of 16,828 which represents right before it prints PASS: + ``` 16828 * 2^-8 = 65.7344 ``` + ![](.//media/image2.png)By default, `ap_fixpt` will truncate bits to bring the result closer to negative infinity. If you add `AP_RND` to the `fixpt_t` typedef on line 25: + ```c typedef ap_fixpt<18, 10, AP_RND> fixpt_t; ``` + Then save, recompile and rerun software simulation. You will find the fixed-point representation will get closer to the desired 65.738 value: + ``` 16829 * 2^-8 = 65.7383 ``` + For this hardware block, more precise rounding is not necessary so remove this change and save. ![](.//media/image2.png)Undo the above changes, and run the Co-simulation to verify that the generated RTL is correct, you should see this output in the Console: + ``` +------------------------+-----------------+--------------------------+... | Top-Level Name | Number of calls | Simulation time (cycles) |... @@ -2117,8 +2158,9 @@ waveforms: Figure 23: SW/HW Co-Simulation with Waveforms for RGB2YCbCr SmartHLS Core


We can look in the C++ main function for the input test vectors, for -example on line 80, the 5th test input and expected output is +example on line 80, the 5<sup>th</sup> test input and expected output is given below: + ```c // test 5 in.R = 119; in.G = 138; in.B = 152; @@ -2126,8 +2168,9 @@ input_fifo.write(in); expected.Y = 131; expected.Cb = 137; expected.Cr = 119; expected_fifo.write(expected); ``` + In the waveforms in Figure 23, the first cursor highlights when the -5th test vector is input to the design under test (DUT) on +5<sup>th</sup> test vector is input to the design under test (DUT) on clock cycle 5 (see `cycle_count` signal). The correct output is received on clock cycle 7 as highlighted by the second cursor. Therefore, the hardware pipeline has a latency of 2 clock cycles (7 – 5 = 2). You can @@ -2139,6 +2182,7 @@ every clock cycle, indicating a pipeline initiation interval of 1. latency reported is 4 cycles. The various reports may differ by 1-2 cycles due to handshaking start/finish signals with the instantiating module. + ``` ====== 2. Timing Result of HLS-generated IP Core (top-level module: RGB2YCbCr_smarthls_top) ====== @@ -2197,15 +2241,16 @@ by constant into adds with shifts-by-constant. In this case, we can save
Table 6: SmartHLS Strength Reduction Optimization -| **Multiply by Constant** | **Fixed Point Representation** | **Equivalent shifts-by-constant and adds** | -| ------------------------ | ------------------------------ | ------------------------------------------------------ | -| 129.057 | 33,038 x 2-8 | \- (1 \<\< 1) + (1 \<\< 4) + (1 \<\< 8) + (1 \<\< 15) | -| 25.064 | 6,416 x 2-8 | \+ (1 \<\< 4) + (1 \<\< 8) + (1 \<\< 11) + (1 \<\< 12) | -| 112.439 (used twice) | 28,784 x 2-8 | \- (1 \<\< 4) + (1 \<\< 7) - (1 \<\< 12) + (1 \<\< 15) | -| 18.285 | 4,680 x 2-8 | \+ (1 \<\< 3) + (1 \<\< 6) + (1 \<\< 9) + (1 \<\< 12) | +| **Multiply by Constant** | **Fixed Point Representation** | **Equivalent shifts-by-constant and adds** | +| ------------------------------ | ------------------------------------ | ------------------------------------------------------ | +| 129.057 | 33,038 x 2<sup>-8</sup> | \- (1 \<\< 1) + (1 \<\< 4) + (1 \<\< 8) + (1 \<\< 15) | +| 25.064 | 6,416 x 2<sup>-8</sup> | \+ (1 \<\< 4) + (1 \<\< 8) + (1 \<\< 11) + (1 \<\< 12) | +| 112.439 (used twice) | 28,784 x 2<sup>-8</sup> | \- (1 \<\< 4) + (1 \<\< 7) - (1 \<\< 12) + (1 \<\< 15) | +| 18.285 | 4,680 x 2<sup>-8</sup> | \+ (1 \<\< 3) + (1 \<\< 6) + (1 \<\< 9) + (1 \<\< 12) |

+ We can turn off the SmartHLS strength reduction pass to see the difference in resources. @@ -2230,6 +2275,7 @@ Now in the Project Explorer, right click and select New -\> File: Enter the file name of “`custom_config.tcl`”. This should match the file name entered in the Set HLS Constraints previously. Click Finish: +


The custom Tcl file allows us to enter advanced SmartHLS Tcl @@ -2243,6 +2289,7 @@ changes. This will turn off (0) the SmartHLS strength reduction ![](.//media/image2.png)Now rerun compile software to hardware (![](.//media/image120.png)). Then rerun FPGA synthesis (![](.//media/image112.png)). The new resources should be: + ``` ====== 2. Timing Result of HLS-generated IP Core (top-level module: RGB2YCbCr_smarthls_top) ====== @@ -2272,6 +2319,7 @@ When the HLS core is integrated into a larger system, the system Fmax may be low * Interface 4LUTs and DFFs are occupied due to the uses of LSRAM, Math, and uSRAM. Number of interface 4LUTs/DFFs = (36 * #.LSRAM) + (36 * #.Math) + (12 * #.uSRAM) = (36 * 0) + (36 * 5) + (12 * 0) = 180. ``` + Now close all project files. ## YCbCr2RGB Block @@ -2285,27 +2333,32 @@ the “YCbCr2RGB” project and open up the `YCbCr2RGB.cpp` file. The top-level function is `YCbCr2RGB_smarthls()` and implements Equation 2 in fixed-point math: +


In this design, the fixed-point type needed 11 integer bits (vs 10 integer bits for RGB2YCbCr). + ```c // Fixed point type: Q11.7 // 11 integer bits and 7 fractional bits typedef ap_fixpt<18, 11> fixpt_t; ``` + Why was this change needed? To avoid overflow caused by larger numbers in the equations. We also need to perform saturation, which converts negative values to 0, and values greater than 255 to 255. We can do this using an 8-bit unsigned `ap_ufixpt` type with the `AP_SAT` option: + ```c // saturate values to [0, 255] range rgb.R = ap_ufixpt<8, 8, AP_TRN, AP_SAT>(R); rgb.G = ap_ufixpt<8, 8, AP_TRN, AP_SAT>(G); rgb.B = ap_ufixpt<8, 8, AP_TRN, AP_SAT>(B); ``` + From the SmartHLS [user guide](https://onlinedocs.microchip.com/v2/keyword-lookup?keyword=hls_arbitary_precision&redirect=true&version=latest), the `AP_SAT` option means that on positive and negative overflow, @@ -2315,6 +2368,7 @@ respectively. ![](.//media/image2.png)Compile (![](.//media/image59.png)) and run (![](.//media/image60.png)) the software to verify software correctness. You should see “PASS” printed in the Console: + ``` Expected: R=0 G=136 B=0 Actual: R=0 G=136 B=0 @@ -2334,9 +2388,11 @@ PASS 23:17:16 Build Finished (took 1s.0ms) ``` + ![](.//media/image2.png)After compiling software to hardware (![](.//media/image122.png)) the following RTL interface should be shown in the `summary.hls.YCbCr2RGB_smarthls.rpt` file: + ``` ====== 1. 
RTL Interface ====== @@ -2365,9 +2421,11 @@ in the `summary.hls.YCbCr2RGB_smarthls.rpt` file: | | | output_fifo_G | 8 | output | +-------------+----------------+---------------------------------+------------------+------------------+ ``` + ![](.//media/image2.png)After running SmartHLS co-simulation (![](.//media/image123.png)) you should see the hardware passes all tests with the following output in the Console: + ``` +------------------------+-----------------+--------------------------+----------------------------+-----------------------+ | Top-Level Name | Number of calls | Simulation time (cycles) | Call Latency (min/max/avg) | Call II (min/max/avg) | @@ -2377,9 +2435,11 @@ tests with the following output in the Console: Simulation time (cycles): 11 SW/HW co-simulation: PASS ``` + ![](.//media/image2.png)Finally, if you run FPGA synthesis (![](.//media/image112.png)) you should see the following expected output in `summary.results.rpt`: + ``` ====== 2. Timing Result of HLS-generated IP Core (top-level module: YCbCr2RGB_smarthls_top) ====== @@ -2409,6 +2469,7 @@ When the HLS core is integrated into a larger system, the system Fmax may be low * Interface 4LUTs and DFFs are occupied due to the uses of LSRAM, Math, and uSRAM. Number of interface 4LUTs/DFFs = (36 * #.LSRAM) + (36 * #.Math) + (12 * #.uSRAM) = (36 * 0) + (36 * 5) + (12 * 0) = 180. ``` + Now close all project files. # Gaussian Blur Filter Block @@ -2451,6 +2512,7 @@ correspond to a Gaussian distribution centered at the middle element sum back to a value between 0 and 255. The values of the filter are specifically chosen so that the `DIVISOR` is a power of 2, making the hardware implementation of the divide a right-shift instead of a divide. + ```c const unsigned int KERNEL_SIZE = 5; @@ -2470,6 +2532,7 @@ We will start with a basic implementation of the Gaussian Blur Filter. Scroll down to the `gaussian_filter_memory()` function on line 25. 
Notice this function is marked as the top-level function by the function top pragma: + ```c // Gaussian Filter. void gaussian_filter_memory(hls::ap_uint<1> on, @@ -2482,6 +2545,7 @@ void gaussian_filter_memory(hls::ap_uint<1> on, There are two array arguments to the top-level function which represents the input image and the filtered output image: + ```c unsigned char input_buffer[][WIDTH], unsigned char output_buffer[][WIDTH] @@ -2492,28 +2556,35 @@ specify that these two array arguments of “memory” type interface have a certain depth. The depth of the memory must also be specified for the co-simulation, since our C++ testbench in `main()` does not use arrays with static size. + ```c #pragma HLS interface argument(input_buffer) type(memory) num_elements(SIZE) #pragma HLS interface argument(output_buffer) type(memory) num_elements(SIZE) ``` + There is also a third input called “on” which is an unsigned int of size 1. + ```c hls::ap_uint<1> on, ``` + This input will be connected to DIP switch 1 (SW6) in the demo design and turns on or off the Gaussian Blur Filter. On line 38, if the switch is turned off (\!on) then we will pass the input directly to the output: + ```c if (!on || out_of_bounds) { output_buffer[i][j] = input_buffer[i][j]; continue; } ``` + The filtering algorithm can be seen in the main loop on line 43. The 5x5 area around the current pixel under consideration is multiplied with its corresponding Gaussian coefficient. The result is summed, normalized then stored in the output array. + ```c unsigned int sum = 0; for (unsigned int m = 0; m < KERNEL_SIZE; m++) { @@ -2533,15 +2604,18 @@ output_buffer[i][j] = (unsigned char)sum; (![](.//media/image54.png)) on the top bar and then click the run software (![](.//media/image55.png)) button. You should see the output in the Console stating that it passed: + ``` Result: 2073600 RESULT: PASS ``` + The testbench for this design is found in the `main()` function on line 59. 
This is very similar to the testbench of the Alpha Blending design where a 1920x1080 bmp image is read as input. There is also a golden output bmp image used to compare with the pixels generated by the filter implementation `gaussian_filter_memory()`. + ```c gaussian_filter_memory(on, input_image, output_image_gaussian); @@ -2553,14 +2627,14 @@ for (i = 0; i < HEIGHT; i++) { output_image_ptr->r = hw; output_image_ptr->g = hw; output_image_ptr->b = hw; - + if (hw != gold) { printf("ERROR: "); printf("i = %d j = %d gold = %d hw = %d\n", i, j, gold, hw); } else { matching++; } - + output_image_ptr++; golden_output_image++; } @@ -2588,6 +2662,7 @@ this training we will run with a smaller image. defined on line 5 and then save the file. The commented out `FAST_COSIM` define might be folded into the comment by eclipse and needs to be expanded by clicking the plus button. + ```c // uncomment this line to test on a smaller image for faster co-simulation #define FAST_COSIM @@ -2600,22 +2675,27 @@ generating the hardware to be exported to SmartDesign, otherwise the generated hardware will be for the incorrect input size. This change is necessary as the function depends on the image sizes in the for-loops on line 34. + ```c for (int i = 0; i < HEIGHT; i++) { for (int j = 0; j < WIDTH; j++) { ``` + ![](.//media/image2.png)You can recompile (![](.//media/image59.png)) and rerun (![](.//media/image60.png)) the software to verify correctness on the new input: + ``` Result: 5600 RESULT: PASS 21:58:06 Build Finished (took 5s.849ms) ``` + ![](.//media/image2.png)Now rerun SmartHLS to generate the hardware (![](.//media/image127.png)) and then run co-simulation with QuestaSim (click the button ![](.//media/image62.png)). 
You should see the following output in the Console stating that the co-sim has passed: + ``` +----------------------------+-----------------+--------------------------+----------------------------+-----------------------+ | Top-Level Name | Number of calls | Simulation time (cycles) | Call Latency (min/max/avg) | Call II (min/max/avg) | @@ -2628,6 +2708,7 @@ SW/HW co-simulation: PASS 09:54:26 Build Finished (took 37s.205ms) ``` + This version of the Gaussian filter is very similar to a software implementation of a Gaussian filter. However, there are multiple ways to improve C++ code to get better hardware performance. @@ -2650,6 +2731,7 @@ to be external to the SmartHLS block itself and only provides the control signals to read and write to the memory based on the loads and stores from inside the function. Also notice that the `ap_uint` argument becomes a single input wire at the interface: + ``` +---------------------------------------------------------------------------------------------------------+ | RTL Interface Generated by SmartHLS | @@ -2695,48 +2777,61 @@ execution. ![](.//media/image2.png)A simple case is if your main function ever returns a non-zero value in software. For example, change the `main()` function to always return 1 on line 129 in `gaussian_filter.cpp`: + ```c //return result_incorrect; return 1; ``` + Now run co-simulation and you will see the output: + ``` Error: Running C testbench failed. Make sure main() returns 0. ``` + Now undo the change. ![](.//media/image2.png)Another time that co-simulation could fail is if the user specifies an incorrect value in a SmartHLS pragma. 
For example, specifying an incorrect depth on a memory interface such as the following on line 29: + ```c #pragma HLS interface argument(input_buffer) type(memory) num_elements(SIZE) ``` + For example, we can try changing the correct SIZE array depth to a wrong value like 10: + ```c #pragma HLS interface argument(input_buffer) type(memory) num_elements(10) ``` Now we rerun SmartHLS to generate the hardware (![](.//media/image127.png)): + ``` Error: Expect the specified depth (10) for argument 'input_buffer' to be a multiple of the combined depth of the inner dimensions (100). Please change the specified depth to a multiple of the combined inner dimension depth (100). ``` + We were not able to get to the co-simulation stage, since SmartHLS was able to detect that the depth was not a multiple of the `WIDTH` (which is 100): + ```c unsigned char input_buffer[][WIDTH], ``` + ![](.//media/image2.png)We can try another wrong array depth which is a multiple of 100 to avoid this SmartHLS check: + ```c #pragma HLS interface argument(input_buffer) type(memory) num_elements(100) ``` + Now rerun SmartHLS to generate the hardware (![](.//media/image127.png)). Since SmartHLS relies on the user to set the correct depth value, SmartHLS does not realize the depth is wrong @@ -2744,6 +2839,7 @@ and will not give an error message. Now when we rerun co-simulation (![](.//media/image128.png)) we will see that co-simulation fails: + ``` +----------------------------+-----------------+--------------------------+----------------------------+-----------------------+ | Top-Level Name | Number of calls | Simulation time (cycles) | Call Latency (min/max/avg) | Call II (min/max/avg) | @@ -2753,6 +2849,7 @@ that co-simulation fails: Simulation time (cycles): 88,021 SW/HW co-simulation: FAIL ``` + In this case, the generated circuit is still correct, but SmartHLS’s automatically generated co-simulation testbench is incorrect. 
Because we specified the wrong depth, the co-simulation testbench is now missing @@ -2772,7 +2869,7 @@ project files. We will continue trying to improve the base Gaussian Filter design. -![](.//media/image2.png)Open the `Gaussian_Memory_Interface_Pipelined` +![](.//media/image2.png)Open the `Gaussian_Memory_Interface_Pipelined` project and then open the `gaussian_filter.cpp` source file.


@@ -2783,9 +2880,11 @@ pragma, the loop body will automatically be partitioned into pipeline stages. The module will also only run the pipeline for the number of iterations of the loop before requiring the start signal to be re-asserted. This optimization should increase throughput considerably. + ```c #pragma HLS loop pipeline ``` + Note, loop pipelining will flatten the loop body by inlining any functions and unrolling any loops. This is to make sure the loop body can be properly analyzed and partitioned into pipeline stages. As the @@ -2795,26 +2894,31 @@ loop body. Not only would this use a massive amount of resources, it will also slow down compilation considerably, both of which we want to avoid. To work around this, the double for loop can be collapsed into a single for loop so that no loop unrolling needs to occur. + ```c #pragma HLS loop pipeline for (int i = 0; i < (HEIGHT * WIDTH); i++) { unsigned int pos_i = i / WIDTH; unsigned int pos_j = i % WIDTH; ``` + ![](.//media/image2.png)Now run “Compile Software to Hardware” (click the ![](.//media/image39.png) button). Look in the Console to find the message about loop pipelining. This message states that the initiation interval of the pipeline is 13 and the number of stages is 23. + ``` Info: Done pipelining the loop on line 35 of gaussian_filter.cpp with label "for_loop_gaussian_filter_cpp_35_5". Pipeline Initiation Interval (II) = 13. Pipeline length = 18. ``` + We can see that there is memory contention within the loop pipeline that prevents the initiation interval from becoming 1 in the SmartHLS Info message: + ``` Info: Pipelining the loop on line 35 of gaussian_filter.cpp with label "for.loop:gaussian_filter.cpp:35:5". Info: Assigning new label to the loop on line 35 of gaussian_filter.cpp with label "for_loop_gaussian_filter_cpp_35_5" @@ -2860,6 +2964,7 @@ line 48 of `gaussian_filter.cpp` but there are only 2 memory ports to use (dual-port RAM in FPGA). 
If we look at line 48 of `gaussian_filter.cpp` we find that all the loads come from the image values read from `input_buffer` used in calculating the new filtered value. + ```c for (unsigned int m = 0; m < KERNEL_SIZE; m++) { for (unsigned int n = 0; n < KERNEL_SIZE; n++) { @@ -2869,11 +2974,13 @@ for (unsigned int m = 0; m < KERNEL_SIZE; m++) { } } ``` + Why is there no memory contention for the `GAUSSIAN` 5x5 array which is also accessed every iteration? Because SmartHLS unrolls the loops and realizes that `GAUSSIAN` is a constant array. Therefore, SmartHLS can automatically replace `GAUSSIAN` array accesses with constant values, becoming equivalent to the following: + ```c unsigned int sum = 0; sum += ((unsigned int)input_buffer[pos_i + 0 - center] @@ -2898,6 +3005,7 @@ Note, the Iteration Count and Latency are much larger than the ones we saw when running co-sim in the design without pipelining, but this is due to the design being generated for the full 1920x1080 input while the co-sim we ran used the reduced 100x56 input. + ``` ====== 2. Function and Loop Scheduling Results ====== @@ -2909,15 +3017,19 @@ co-sim we ran used the reduced 100x56 input. | for.loop:gaussian_filter.cpp:35:5 | line 35 of gaussian_filter.cpp | 2073600 | 16 | 13 | 26956803 | +-----------------------------------+--------------------------------+------------+-------------------+----+---------------+ ``` + ![](.//media/image2.png)Now uncomment `FAST_COSIM` on `define.hpp` line 5, save, then rerun SmartHLS to generate the hardware (![](.//media/image127.png)) and then run co-simulation with QuestaSim (click the button ![](.//media/image62.png)). 
+ ```c // uncomment this line to test on a smaller image for faster co-simulation #define FAST_COSIM ``` + Once co-sim finishes, you should see the following output in the Console: + ``` +--------------------------------------+-----------------+--------------------------+----------------------------+-----------------------+ | Top-Level Name | Number of calls | Simulation time (cycles) | Call Latency (min/max/avg) | Call II (min/max/avg) | @@ -2930,9 +3042,9 @@ SW/HW co-simulation: PASS 10:18:42 Build Finished (took 1m:13s.112ms) ``` + With this pipeline optimization and pipeline length reduced to 16, the -time to process one frame becomes approximately `HEIGHT * WIDTH * 13 + -LATENCY` (100\*56\*13+16 = 72,816), which is a significant improvement +time to process one frame becomes approximately `HEIGHT * WIDTH * 13 + LATENCY` (100\*56\*13+16 = 72,816), which is a significant improvement over the previous design. We can see that this has reduced the cycle latency from 88,021 to 72,816 which is a ~17% reduction in latency. @@ -3009,28 +3121,35 @@ open the `gaussian_filter.cpp` source file. Scroll down to the `gaussian_filter_pipelined()` top-level function on line 45. Both the `input_fifo` and `output_fifo` function arguments are now FIFO interfaces. + ```c void gaussian_filter_pipelined(hls::ap_uint<1> on_switch, hls::FIFO &input_fifo, hls::FIFO &output_fifo) { ``` + Now scroll down to line 58. The loop within `gaussian_filter_pipelined()` is still loop pipelined. + ```c #pragma HLS loop pipeline for (unsigned int i = 0; i < (HEIGHT * WIDTH + LineBufferFillCount); i++) { ``` + On line 52, the declaration of the LineBuffer takes as C++ template arguments: the data type, the width of the image processed and the size of the filter. These arguments to tell the LineBuffer how much memory to allocate for the internal buffer. 
+ ```c hls::LineBuffer line_buffer; ``` + Every iteration of the loop, there will be a new pixel that gets shifted into the internal array of the LineBuffer. We want to pre-fill the line buffer to have all the necessary pixels to filter the first image pixel before we start the filtering. + ```c line_buffer.ShiftInPixel(input_pixel); @@ -3040,9 +3159,11 @@ if (!is_filled(KERNEL_SIZE, i)) { continue; } ``` + Once we fill the LineBuffer, we filter the image as normal on line 84 by using the *window* member of the LineBuffer which provides the pixels in the window of the pixel currently being processed. + ```c unsigned int sum = 0; for (unsigned int m = 0; m < KERNEL_SIZE; m++) { @@ -3054,6 +3175,7 @@ for (unsigned int m = 0; m < KERNEL_SIZE; m++) { sum /= DIVISOR; output_fifo.write((unsigned char)sum); ``` + ![](.//media/image2.png)Using FIFOs and the LineBuffer data structure, we can reduce the initiation interval of the pipeline to 1 and process one pixel every single cycle. To see this, compile the design to @@ -3061,15 +3183,18 @@ hardware (![](.//media/image39.png)). Upon successful pipelining, you should find the following message in the Console output stating that the pipeline initiation interval is 1: + ``` Info: Generating pipeline for loop on line 59 of gaussian_filter.cpp with label "for_loop_gaussian_filter_cpp_59_5". Pipeline initiation interval = 1.  ``` + This result can also be found in the summary.hls.gaussian\_filter\_pipelined.rpt under section 2. Find *gaussian\_filter\_pipelined* and scroll to the right to see the pipeline result information. + ``` ====== 2. Function and Loop Scheduling Results ====== @@ -3081,15 +3206,19 @@ pipeline result information. 
| for.loop:gaussian_filter.cpp:59:5 | line 59 of gaussian_filter.cpp | 2077442 | 3 | 1 | 2077444 | +-----------------------------------+--------------------------------+------------+-------------------+----+---------------+ ``` + Also note, further up in the Console output you can find a console message stating that a LineBuffer memory has been partitioned. + ``` Info: Partitioning memory: line_buffer into 30 partitions. ``` + ![](.//media/image2.png)Go to `summary.hls.gaussian_filter_pipelined.rpt` and scroll to section 3. There are additional partitioned memories that can be found here that are not mentioned in the Console. + ``` +-------------------------------------------------------------------------------------------------------------------------------------------+ | Local Memories | @@ -3102,8 +3231,10 @@ mentioned in the Console. | gaussian_filter_pipelined_BB_0_line_buffer_prev_ro_2 | gaussian_filter_pipelined | RAM | 15360 | 8 | 1920 | 1 | +------------------------------------------------------+---------------------------+------+-------------+------------+-------+--------------+ ``` + See section 1 of the reports to verify the interface ports which have now changed to FIFOs. + ``` ====== 1. RTL Interface ====== @@ -3130,10 +3261,12 @@ now changed to FIFOs. | on_switch | Scalar Argument | on_switch | 1 | input | +-------------+--------------------+---------------------------------+------------------+------------------+ ``` + ![](.//media/image2.png)Again, uncomment `FAST_COSIM` in `define.hpp` line 5, save, then rerun SmartHLS to generate the hardware (![](.//media/image127.png)) and then run co-simulation with QuestaSim (click the button ![](.//media/image62.png)). + ```c // uncomment this line to test on a smaller image for faster co-simulation #define FAST_COSIM @@ -3155,6 +3288,7 @@ SW/HW co-simulation: PASS 10:33:51 Build Finished (took 1m:12s.814ms) ``` +   Notice that the cycle latency has been further reduced to 5,809. 
This can be found roughly by `HEIGHT * WIDTH + LATENCY` (100\*56+6=5606). This @@ -3164,13 +3298,16 @@ the version without pipelining (88,021) by 93%. ![](.//media/image2.png)Now re-comment FAST\_COSIM in define.hpp, save, then rerun SmartHLS (![](.//media/image127.png)) to regenerate the hardware for 1920x1080 inputs. + ```c // uncomment this line to test on a smaller image for faster co-simulation // #define FAST_COSIM ``` + ![](.//media/image2.png)Synthesize to design to FPGA (![](.//media/image112.png)) and check the FMAX and resource usage in the `summary.results.rpt` file. + ``` ====== 2. Timing Result of HLS-generated IP Core (top-level module: gaussian_filter_pipelined_top) ====== @@ -3200,6 +3337,7 @@ When the HLS core is integrated into a larger system, the system Fmax may be low * Interface 4LUTs and DFFs are occupied due to the uses of LSRAM, Math, and uSRAM. Number of interface 4LUTs/DFFs = (36 * #.LSRAM) + (36 * #.Math) + (12 * #.uSRAM) = (36 * 4) + (36 * 0) + (12 * 0) = 144. ``` + We can see from section 2 of summary.result.rpt that the minimum period for the synthesized block is 4.632 ns, which is below the threshold of 6.353 ns from the demo design. This means we can safely integrate this @@ -3215,6 +3353,7 @@ an image, as shown in Figure 26: Side-by-side comparison of original (left) and Canny Edge Filtered (right) image. The left image is the original, and the right image is after running the Canny edge detection filter. +

Figure 26: Side-by-side comparison of original (left) and Canny Edge Filtered (right) images


@@ -3247,8 +3386,8 @@ dataflow pragma causes the four sub-functions to overlap their execution and is ideal for generating a design where multiple functions are connected to operate as a single pipeline. To learn more about the dataflow pragma, see the [SmartHLS -Documentation](https://onlinedocs.microchip.com/v2/keyword-lookup?keyword=hls_data_flow&redirect=true&version=latest -). +Documentation](https://onlinedocs.microchip.com/v2/keyword-lookup?keyword=hls_data_flow&redirect=true&version=latest). + ```c void canny(hls::FIFO &input_fifo, hls::FIFO &output_fifo) { @@ -3271,11 +3410,13 @@ void canny(hls::FIFO &input_fifo, hysteresis_filter(output_fifo_nm, output_fifo); } ``` + The testbench for the Canny design on line 108 is similar to the Gaussian Filter testbench, however this design has an extra software implementation to compare against the hardware optimized version. The testbench checks that the software output, hardware output and golden output are all equal during co-simulation. + ```c // output validation for (i = 0; i < HEIGHT; i++) { @@ -3301,12 +3442,14 @@ for (i = 0; i < HEIGHT; i++) { } } ``` + ![](.//media/image2.png)Now generate the hardware (![](.//media/image127.png)) and then open the summary.hls.canny.rpt file, go to section 2 and scroll to the right to verify that all of the four filter functions have an initiation interval of 1. As every filter in the top level function has an initiation interval of 1, the entire pipeline then has an initiation interval of 1 as well. + ``` ====== 2. Function and Loop Scheduling Results ====== @@ -3346,11 +3489,13 @@ pipeline then has an initiation interval of 1 as well. 
+-------------------------------------+----------------------------------+------------+-------------------+----+---------------+ ``` + ![](.//media/image2.png)Now we uncomment `FAST_COSIM` in `define.hpp`, save, then rerun SmartHLS to generate the hardware (![](.//media/image127.png)) and then run co-simulation with QuestaSim (![](.//media/image62.png)). You should see the following output in the Console: + ``` Retrieving hardware outputs from RTL simulation for canny function call 1. Result: 5600 @@ -3359,6 +3504,7 @@ RESULT: PASS Simulation time (cycles): 6,119 SW/HW co-simulation: PASS ``` + Notice that although the pipeline is longer for Canny, the cycle latency of the simulation is about the same as that of the pipelined Gaussian design. This is because extra latency in a pipeline with initiation @@ -3380,6 +3526,7 @@ click the `canny.cpp` source file. Notice on line 6 that canny now has 4 additional scalar inputs which represents the switch input that in turn goes to each filter. + ```c void canny(bool switch_0, bool switch_1, @@ -3406,9 +3553,11 @@ void canny(bool switch_0, hysteresis_filter(switch_3, output_fifo_nm, output_fifo); }  ``` + Inside of the functions, for example on line 37 of `hysteresis_filter.cpp` this switch is used to decide whether to pass through the pixel or apply filtering. + ```c // if filter is off, pass pixel through if (!on_switch) { @@ -3416,10 +3565,12 @@ if (!on_switch) { continue; } ``` + ![](.//media/image2.png)Run “Compile Software to Hardware” (click the ![](.//media/image39.png) button). Open the `summary.hls.canny.rpt` file and verify that there are now four more scalar interfaces for each of the switches in section 1. + ``` +-------------+--------------------+-------------+------------------+------------------+ | switch_0 | Scalar Argument | switch_0 | 1 | input | @@ -3431,8 +3582,10 @@ the switches in section 1. 
| switch_3 | Scalar Argument | switch_3 | 1 | input | +-------------+--------------------+-------------+------------------+------------------+ ``` + ![](.//media/image2.png)Synthesize to FPGA (![](.//media/image112.png)) and check the Fmax and resource usage. + ``` ====== 2. Timing Result of HLS-generated IP Core (top-level module: canny_top) ====== @@ -3460,6 +3613,7 @@ When the HLS core is integrated into a larger system, the system Fmax may be low +--------------------------+-------------------+--------+------------+ ``` + We can see from section 2 of `summary.result.rpt` that the minimum period for the synthesized block is 6.092 ns, which is below the threshold of 6.734 ns from the demo design. This means we can safely integrate this @@ -3472,56 +3626,56 @@ SmartHLS generated Canny Edge Detection block and import it into SmartDesign. This will showcase the flow for integrating SmartHLS generated Verilog Cores into Libero® SoC SmartDesign. -1. Open `define.hpp` in the `Canny_FIFO_Switch` project in the Project - Explorer and check that `FAST_COSIM` is commented out. The - functionality of this hardware block depends on knowing the `WIDTH` - and `HEIGHT` of the input image. -


+1. Open `define.hpp` in the `Canny_FIFO_Switch` project in the Project + Explorer and check that `FAST_COSIM` is commented out. The + functionality of this hardware block depends on knowing the `WIDTH` + and `HEIGHT` of the input image. +


+ ```c // uncomment this line to test on a smaller image for faster co-simulation //#define FAST_COSIM -``` -2. Click the “Compile Software to Hardware” button - ![](.//media/image80.png) on the top toolbar. - -3. Launch Libero® SoC 2024.2 and open the project: “`Libero_training1/Libero_training1.prjx`” - -4. Navigate to the Design Hierarchy and search for “canny”. Right click - the canny_top design component and select Unlink. This is to make - sure there are no duplicated blocks before importing the new - canny_top HDL+ block from SmartHLS. -


-5. Without clearing the search, double click the `LegUp_Image_Filters` - SmartDesign file to open it in the SmartDesign Canvas. Then find the - `canny_top_0` block which should now be missing and colored red. -


-


- -6. On the top toolbar, click Project-\>Execute Script... and run the - `create_hdl_plus.tcl` file from the `Canny_FIFO_Switch` SmartHLS - project directory which will import the new canny_top into the - design hierarchy. This will open a report window when it finishes. - Make sure there are no errors and close the report window. -


- -7. Right click on the `canny_top_0` component, select “Replace - Component…” and then replace it with the newly imported `canny_top`. - Make sure to check “Replace all instances using ‘canny_top’ with - new selected component.” -


+``` -8. After replacing the SmartDesign component, canny_top should no - longer be red as shown below. -


+2. Click the “Compile Software to Hardware” button + ![](.//media/image80.png) on the top toolbar. +3. Launch Libero® SoC 2024.2 and open the project: “`Libero_training1/Libero_training1.prjx`” +4. Navigate to the Design Hierarchy and search for “canny”. Right click + the canny_top design component and select Unlink. This is to make + sure there are no duplicated blocks before importing the new + canny_top HDL+ block from SmartHLS. + +


+5. Without clearing the search, double click the `LegUp_Image_Filters` + SmartDesign file to open it in the SmartDesign Canvas. Then find the + `canny_top_0` block which should now be missing and colored red. + +


+


+6. On the top toolbar, click Project-\>Execute Script... and run the + `create_hdl_plus.tcl` file from the `Canny_FIFO_Switch` SmartHLS + project directory which will import the new canny_top into the + design hierarchy. This will open a report window when it finishes. + Make sure there are no errors and close the report window. + +


+7. Right click on the `canny_top_0` component, select “Replace + Component…” and then replace it with the newly imported `canny_top`. + Make sure to check “Replace all instances using ‘canny_top’ with + new selected component.” -9. Click the “Generate Component” (![](.//media/image88.png)) button in - the SmartDesign toolbar for `LegUp_Image_Filters` and each parent - component (`video_pipelining`, `VIDEO_KIT_TOP`). +


-10. Go to the Design Flow tab and double click Generate FPGA Array Data. - This should take 1-2h to finish running. +8. After replacing the SmartDesign component, canny_top should no + longer be red as shown below. -11. The Mi-V soft processor receives configuration from the Video +


+9. Click the “Generate Component” (![](.//media/image88.png)) button in + the SmartDesign toolbar for `LegUp_Image_Filters` and each parent + component (`video_pipelining`, `VIDEO_KIT_TOP`). +10. Go to the Design Flow tab and double click Generate FPGA Array Data. + This should take 1-2h to finish running. +11. The Mi-V soft processor receives configuration from the Video Control GUI running on the PC via the USB-UART. The Mi-V uses this configuration to control the Image/Video Processing block. To program the executable that runs on the Mi-V, double click @@ -3534,6 +3688,7 @@ generated Verilog Cores into Libero® SoC SmartDesign. 12. Check the “Filter out Inferred RAMs” checkbox and look for `PROC_SUBSYSTEM_0/PF_SRAM_AHBL_AXI_C0_0`. It should be Logical Instance 8. Double click on it. +



@@ -3543,16 +3698,14 @@ generated Verilog Cores into Libero® SoC SmartDesign. Remember to select the “Use relative path” option when browsing to the memory file: +


14. Click OK, then click “Apply” in the Design and Memory Initialization tab. - 15. Under “Design Flow” double-click “Generate Bitstream”. - 16. With the same setup as [Programming and Running Design on the PolarFire® Kit](#programming-and-running-design-on-the-polarfire-kit), double click “Run PROGRAM Action” to program the board. - 17. You can also double-click “Export FlashPro Express job” to create an updated .job file.