Skip to content

Instantly share code, notes, and snippets.

@eightycc
Created March 15, 2025 18:20
Show Gist options
  • Save eightycc/478033e458a2fd6b7da95a5c4c082679 to your computer and use it in GitHub Desktop.
Save eightycc/478033e458a2fd6b7da95a5c4c082679 to your computer and use it in GitHub Desktop.
Benchmark XIP Flash, PSRAM, and XIP Streaming
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "pico/flash.h"
#include "pico/stdlib.h"
#include "pico/rand.h"
#include "pico/time.h"
#include "hardware/dma.h"
#include "hardware/flash.h"
#include "hardware/sync.h"
#include "hardware/xip_cache.h"
#include "hardware/regs/addressmap.h"
#include "hardware/regs/qmi.h"
#include "hardware/regs/xip.h"
#include "hardware/structs/xip_ctrl.h"
#include "hardware/structs/qmi.h"
#include "hardware/structs/xip_ctrl.h"
// Size in bytes of each benchmark buffer (the buffers below hold BUF_SIZE/4 32-bit words).
#define BUF_SIZE (128*1024)
#define PICO_PSRAM_CHIP_SELECT 8 // Feather RP2350 GPIO pin number for PSRAM chip select
// Base addresses used by the benchmarks for PSRAM accesses.
// NOTE(review): presumably the XIP chip-select-1 window and its uncached
// alias -- confirm against the RP2350 address map.
#define XIP_PSRAM_CACHED 0x11000000
#define XIP_PSRAM_NOCACHE 0x15000000
// Byte offset into flash of the region that main() erases and programs.
#define FLASH_TARGET_OFFSET (1024 * 1024) // +1MB should be safe to use
// random_buf holds the reference pattern (filled from get_rand_32() in main);
// copy_buf is the SRAM destination for the copy benchmarks.
static volatile uint32_t random_buf[BUF_SIZE/4];
static volatile uint32_t copy_buf[BUF_SIZE/4];
// Written by setup_psram(): detected PSRAM size in bytes (0 when detection or
// the test write failed) and the raw bytes returned by the read-ID command.
static size_t _psram_size;
static uint8_t _psram_readid_response[8];
// Activate PSRAM. (Copied from CircuitPython ports/raspberrypi/supervisor/port.c)
//
// Probes the device on QMI chip select 1 using QMI direct mode. If the device
// answers the read-ID command (0x9f) with a KGD byte of 0x5D, it is reset,
// sent the quad-enable command, and the QMI M1 window is programmed for quad
// reads (0xeb) and quad writes (0x38).
//
// Results are left in file-scope state:
//   _psram_readid_response - the 8 bytes received after the 4-byte
//                            command/address phase of read ID.
//   _psram_size            - size in bytes decoded from the EID byte, or 0
//                            when the KGD check or the final test write fails.
//
// Interrupts are disabled from before the first direct-mode access until the
// M1 registers have been written (or until the early return on a KGD mismatch).
// NOTE(review): the __no_inline_not_in_flash_func wrapper presumably keeps this
// code out of flash because XIP is unavailable while direct mode is enabled --
// confirm against the Pico SDK documentation.
static void __no_inline_not_in_flash_func(setup_psram)(void) {
gpio_set_function(PICO_PSRAM_CHIP_SELECT, GPIO_FUNC_XIP_CS1);
_psram_size = 0;
uint32_t save_irq_status = save_and_disable_interrupts();
// Try and read the PSRAM ID via direct_csr.
// Direct mode is enabled with a clock divisor of 30.
qmi_hw->direct_csr = 30 << QMI_DIRECT_CSR_CLKDIV_LSB |
QMI_DIRECT_CSR_EN_BITS;
// Need to poll for the cooldown on the last XIP transfer to expire
// (via direct-mode BUSY flag) before it is safe to perform the first
// direct-mode operation
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_BUSY_BITS) != 0) {
}
// Exit out of QMI in case we've inited already
// NOTE(review): 0xf5 is presumably the PSRAM's "exit quad mode" command --
// confirm against the PSRAM datasheet.
qmi_hw->direct_csr |= QMI_DIRECT_CSR_ASSERT_CS1N_BITS;
// Transmit as quad.
qmi_hw->direct_tx = QMI_DIRECT_TX_OE_BITS |
QMI_DIRECT_TX_IWIDTH_VALUE_Q << QMI_DIRECT_TX_IWIDTH_LSB |
0xf5;
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_BUSY_BITS) != 0) {
}
// Discard whatever was received during the command.
(void)qmi_hw->direct_rx;
qmi_hw->direct_csr &= ~(QMI_DIRECT_CSR_ASSERT_CS1N_BITS);
// Read the id
// One chip-select assertion covers 12 transfers: the 0x9f command followed
// by eleven 0xff bytes.
qmi_hw->direct_csr |= QMI_DIRECT_CSR_ASSERT_CS1N_BITS;
uint8_t kgd = 0;
uint8_t eid = 0;
for (size_t i = 0; i < 12; i++) {
if (i == 0) {
qmi_hw->direct_tx = 0x9f;
} else {
qmi_hw->direct_tx = 0xff;
}
// Wait for the TX FIFO to drain and the transfer to finish before reading RX.
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_TXEMPTY_BITS) == 0) {
}
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_BUSY_BITS) != 0) {
}
// buffer read id response eliding the first 4 bytes (cmd + 24-bit addr)
if (i >= 4) {
_psram_readid_response[i - 4] = qmi_hw->direct_rx;
} else {
(void)qmi_hw->direct_rx;
}
// Response byte 1 is taken as KGD and response byte 2 as EID.
if (i == 5) {
kgd = _psram_readid_response[i-4];
} else if (i == 6) {
eid = _psram_readid_response[i-4];
}
}
// Disable direct csr.
qmi_hw->direct_csr &= ~(QMI_DIRECT_CSR_ASSERT_CS1N_BITS | QMI_DIRECT_CSR_EN_BITS);
// Anything other than the expected KGD value is treated as "no PSRAM":
// leave _psram_size at 0 and skip all further configuration.
if (kgd != 0x5D) {
restore_interrupts(save_irq_status);
return;
}
// Enable quad mode.
qmi_hw->direct_csr = 30 << QMI_DIRECT_CSR_CLKDIV_LSB |
QMI_DIRECT_CSR_EN_BITS;
// Need to poll for the cooldown on the last XIP transfer to expire
// (via direct-mode BUSY flag) before it is safe to perform the first
// direct-mode operation
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_BUSY_BITS) != 0) {
}
// RESETEN, RESET and quad enable
// Each command (0x66, 0x99, 0x35) gets its own chip-select assertion.
for (uint8_t i = 0; i < 3; i++) {
qmi_hw->direct_csr |= QMI_DIRECT_CSR_ASSERT_CS1N_BITS;
if (i == 0) {
qmi_hw->direct_tx = 0x66;
} else if (i == 1) {
qmi_hw->direct_tx = 0x99;
} else {
qmi_hw->direct_tx = 0x35;
}
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_BUSY_BITS) != 0) {
}
qmi_hw->direct_csr &= ~(QMI_DIRECT_CSR_ASSERT_CS1N_BITS);
// Short busy-wait with chip select released before the next command.
for (size_t j = 0; j < 20; j++) {
asm ("nop");
}
(void)qmi_hw->direct_rx;
}
// Disable direct csr.
qmi_hw->direct_csr &= ~(QMI_DIRECT_CSR_ASSERT_CS1N_BITS | QMI_DIRECT_CSR_EN_BITS);
// PSRAM timings with 150MHz SCK, 6.667ns per cycle.
// COOLDOWN: 2'b01 64 SCK = 426.7ns
// PAGEBREAK: 2'b10 break bursts at 1024-byte page boundaries
// reserved: 2'b00
// SELECT_SETUP: 1'b0 0.5 SCK = 3.33ns
// SELECT_HOLD: 2'b00 1.0 SCK = 6.67ns
// MAX_SELECT: 6'b01_0000 16 x 64 SCK = 6.827us
// MIN_DESELECT: 4'b0111 7.5 x SCK = 50.0ns
// reserved: 1'b0
// RXDELAY: 3'b001 0.5 SCK = 3.33ns
// CLKDIV: 8'b0000_0010 0.5 SCK = 3.33ns = 75MHz
// NOTE(review): the table above does not fully agree with the code below.
// It lists SELECT_HOLD as 0 while the code writes 3, and it assumes a
// 6.667ns cycle while the inline comments divide by 7.52ns. The values
// actually written are: PAGEBREAK=1024, SELECT_HOLD=3, COOLDOWN=1,
// RXDELAY=1, MAX_SELECT=16, MIN_DESELECT=7, CLKDIV=2. Confirm which clock
// the timings were derived for.
qmi_hw->m[1].timing =
QMI_M0_TIMING_PAGEBREAK_VALUE_1024 << QMI_M0_TIMING_PAGEBREAK_LSB | // Break between pages.
3 << QMI_M0_TIMING_SELECT_HOLD_LSB | // Delay releasing CS for 3 extra system cycles.
1 << QMI_M0_TIMING_COOLDOWN_LSB |
1 << QMI_M0_TIMING_RXDELAY_LSB |
16 << QMI_M0_TIMING_MAX_SELECT_LSB | // In units of 64 system clock cycles. PSRAM says 8us max. 8 / 0.00752 / 64 = 16.62
7 << QMI_M0_TIMING_MIN_DESELECT_LSB | // In units of system clock cycles. PSRAM says 50ns.50 / 7.52 = 6.64
2 << QMI_M0_TIMING_CLKDIV_LSB;
// Read format: every phase quad width, 8-bit prefix (the 0xeb command set
// below), 24 dummy bits (DUMMY_LEN_VALUE_24), no suffix.
qmi_hw->m[1].rfmt = (QMI_M0_RFMT_PREFIX_WIDTH_VALUE_Q << QMI_M0_RFMT_PREFIX_WIDTH_LSB |
QMI_M0_RFMT_ADDR_WIDTH_VALUE_Q << QMI_M0_RFMT_ADDR_WIDTH_LSB |
QMI_M0_RFMT_SUFFIX_WIDTH_VALUE_Q << QMI_M0_RFMT_SUFFIX_WIDTH_LSB |
QMI_M0_RFMT_DUMMY_WIDTH_VALUE_Q << QMI_M0_RFMT_DUMMY_WIDTH_LSB |
QMI_M0_RFMT_DUMMY_LEN_VALUE_24 << QMI_M0_RFMT_DUMMY_LEN_LSB |
QMI_M0_RFMT_DATA_WIDTH_VALUE_Q << QMI_M0_RFMT_DATA_WIDTH_LSB |
QMI_M0_RFMT_PREFIX_LEN_VALUE_8 << QMI_M0_RFMT_PREFIX_LEN_LSB |
QMI_M0_RFMT_SUFFIX_LEN_VALUE_NONE << QMI_M0_RFMT_SUFFIX_LEN_LSB);
qmi_hw->m[1].rcmd = 0xeb << QMI_M0_RCMD_PREFIX_LSB |
0 << QMI_M0_RCMD_SUFFIX_LSB;
// Write format: every phase quad width, 8-bit prefix (the 0x38 command set
// below), no dummy phase, no suffix.
qmi_hw->m[1].wfmt = (QMI_M0_WFMT_PREFIX_WIDTH_VALUE_Q << QMI_M0_WFMT_PREFIX_WIDTH_LSB |
QMI_M0_WFMT_ADDR_WIDTH_VALUE_Q << QMI_M0_WFMT_ADDR_WIDTH_LSB |
QMI_M0_WFMT_SUFFIX_WIDTH_VALUE_Q << QMI_M0_WFMT_SUFFIX_WIDTH_LSB |
QMI_M0_WFMT_DUMMY_WIDTH_VALUE_Q << QMI_M0_WFMT_DUMMY_WIDTH_LSB |
QMI_M0_WFMT_DUMMY_LEN_VALUE_NONE << QMI_M0_WFMT_DUMMY_LEN_LSB |
QMI_M0_WFMT_DATA_WIDTH_VALUE_Q << QMI_M0_WFMT_DATA_WIDTH_LSB |
QMI_M0_WFMT_PREFIX_LEN_VALUE_8 << QMI_M0_WFMT_PREFIX_LEN_LSB |
QMI_M0_WFMT_SUFFIX_LEN_VALUE_NONE << QMI_M0_WFMT_SUFFIX_LEN_LSB);
qmi_hw->m[1].wcmd = 0x38 << QMI_M0_WCMD_PREFIX_LSB |
0 << QMI_M0_WCMD_SUFFIX_LSB;
restore_interrupts(save_irq_status);
// Decode the size from EID: the top three bits select 2, 4 or 8 MiB
// (an EID of exactly 0x26 is also treated as 8 MiB); any other value
// leaves the 1 MiB default.
_psram_size = 1024 * 1024; // 1 MiB
uint8_t size_id = eid >> 5;
if (eid == 0x26 || size_id == 2) {
_psram_size *= 8;
} else if (size_id == 0) {
_psram_size *= 2;
} else if (size_id == 1) {
_psram_size *= 4;
}
// Mark that we can write to PSRAM.
xip_ctrl_hw->ctrl |= XIP_CTRL_WRITABLE_M1_BITS;
// Test write to the PSRAM.
// The first word is written and read back through the uncached address;
// a mismatch reports the PSRAM as absent by zeroing _psram_size.
volatile uint32_t *psram_nocache = (volatile uint32_t *)XIP_PSRAM_NOCACHE;
psram_nocache[0] = 0x12345678;
volatile uint32_t readback = psram_nocache[0];
if (readback != 0x12345678) {
_psram_size = 0;
return;
}
}
// Flash erase and program function trampolines
// Trampoline handed to flash_safe_execute() for erasing flash.
// param points at an array of two uintptr_t values: { flash offset, length }.
static void call_flash_range_erase(void *param) {
    const uintptr_t *args = (const uintptr_t *)param;
    const uint32_t flash_offs = args[0];
    const size_t byte_count = args[1];

    flash_range_erase(flash_offs, byte_count);
}
// Trampoline handed to flash_safe_execute() for programming flash.
// param points at an array of three uintptr_t values:
// { flash offset, address of the source data, length }.
static void call_flash_range_program(void *param) {
    const uintptr_t *args = (const uintptr_t *)param;
    const uint32_t flash_offs = args[0];
    const uint8_t *payload = (const uint8_t *)args[1];
    const size_t byte_count = args[2];

    flash_range_program(flash_offs, payload, byte_count);
}
// Compare two word buffers and report the first difference on stdout.
//
// src: reference data.
// dst: data to check against the reference.
// len: number of BYTES to compare. len / 4 whole 32-bit words are examined,
//      so callers must pass a byte count, not a word count.
//
// Returns true when every compared word matches (including when len < 4, in
// which case nothing is compared), false at the first mismatch.
static bool verify_buffer(const uint32_t *src, const uint32_t *dst, size_t len) {
    const size_t words = len / 4;
    for (size_t i = 0; i < words; i++) {
        if (src[i] != dst[i]) {
            // %zu matches size_t; the casts make the 32-bit values match %x
            // on targets where uint32_t is not plain unsigned int.
            printf("Buffer mismatch at index %zu: %08x != %08x\n",
                   i, (unsigned)src[i], (unsigned)dst[i]);
            return false;
        }
    }
    return true;
}
// Copy len bytes (len / 4 whole 32-bit words) from src to dst one word at a
// time and return the elapsed time in microseconds as measured by time_us_32().
//
// src: source words.
// dst: destination words.
// len: number of BYTES to copy.
static uint32_t time_copy_buffer(const uint32_t *src, uint32_t *dst, size_t len) {
    const size_t words = len / 4;
    uint32_t start_time = time_us_32();
    // size_t index: the bound is a size_t, so a signed int index would be a
    // signed/unsigned comparison.
    for (size_t i = 0; i < words; i++) {
        dst[i] = src[i];
    }
    uint32_t end_time = time_us_32();
    // Unsigned subtraction stays correct across a single wrap of the 32-bit timer.
    return end_time - start_time;
}
// Transfer len bytes (len / 4 words) from an XIP address to dst using the XIP
// streaming FIFO drained by DMA channel 0, and return the elapsed time in
// microseconds. Timing starts just before the DMA channel is configured and
// triggered, and ends once the channel reports completion.
//
// src: XIP address to stream from (flash or PSRAM).
// dst: destination buffer.
// len: number of BYTES to transfer.
static uint32_t time_xip_stream(const uint32_t *src, uint32_t *dst, size_t len) {
    const uint dma_chan = 0;
    const size_t word_count = len / 4;

    // Discard anything still sitting in the stream FIFO.
    while ((xip_ctrl_hw->stat & XIP_STAT_FIFO_EMPTY) == 0) {
        (void)xip_ctrl_hw->stream_fifo;
    }

    // Program the stream: start address, then length (stream_ctr is in 32-bit words).
    xip_ctrl_hw->stream_addr = (uint32_t)src;
    xip_ctrl_hw->stream_ctr = word_count;

    // DMA reads from a fixed address and writes to an incrementing one,
    // paced by the XIP stream DREQ.
    dma_channel_config stream_cfg = dma_channel_get_default_config(dma_chan);
    channel_config_set_read_increment(&stream_cfg, false);
    channel_config_set_write_increment(&stream_cfg, true);
    channel_config_set_dreq(&stream_cfg, DREQ_XIP_STREAM);

    const uint32_t t_begin = time_us_32();
    dma_channel_configure(dma_chan, &stream_cfg,
                          (void *)dst,                 // write address
                          (const void *)XIP_AUX_BASE,  // read address
                          word_count,                  // transfer count
                          true);                       // trigger immediately
    dma_channel_wait_for_finish_blocking(dma_chan);
    const uint32_t t_end = time_us_32();

    return t_end - t_begin;
}
// Zero the SRAM destination buffer so that a benchmark which fails to write it
// cannot pass verification on data left behind by an earlier test.
static void clear_copy_buf(void) {
    memset((void *)copy_buf, 0, sizeof copy_buf);
}

// Write the reference pattern into the start of PSRAM through the uncached
// address, with the XIP cache cleaned before and after.
static void load_psram_reference(void) {
    xip_cache_clean_all();
    time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)XIP_PSRAM_NOCACHE, BUF_SIZE);
    xip_cache_clean_all();
}

// Print one result line ("<label> in <us> us, <ns> ns/word") and check the
// BUF_SIZE bytes at `actual` against the reference pattern in random_buf.
static void report_copy(const char *label, uint32_t copy_time, const uint32_t *actual) {
    printf("%s in %u us, %u ns/word\n", label, (unsigned)copy_time,
           (unsigned)((copy_time * 1000) / (BUF_SIZE / 4)));
    if (!verify_buffer((const uint32_t *)random_buf, actual, BUF_SIZE)) {
        printf("Buffer verification failed\n");
    }
}

// Benchmark entry point: fills an SRAM buffer with random words, programs it
// into flash, then times word-by-word copies and XIP-stream transfers between
// SRAM, flash and PSRAM, verifying each result against the reference pattern.
// PSRAM benchmarks are skipped when setup_psram() reports no usable PSRAM.
// Returns 0 on completion, -1 if the flash erase or program step fails.
int main()
{
    stdio_init_all();
    sleep_ms(1000); // Allow time to start USB terminal
    setup_psram();
    const bool have_psram = (_psram_size != 0);

    printf("PSRAM size: %zu bytes\n", _psram_size);
    printf("PSRAM read ID: %02x %02x %02x %02x %02x %02x %02x %02x\n",
           _psram_readid_response[0], _psram_readid_response[1], _psram_readid_response[2], _psram_readid_response[3],
           _psram_readid_response[4], _psram_readid_response[5], _psram_readid_response[6], _psram_readid_response[7]);
    printf("Buffer size: %d bytes\n", BUF_SIZE);
    if (!have_psram) {
        printf("PSRAM not detected; PSRAM benchmarks will be skipped\n");
    }

    // All region addresses are computed as BYTE addresses first and only then
    // converted to word pointers, so offsets are not scaled by sizeof(uint32_t).
    const uint32_t *const reference = (const uint32_t *)random_buf;
    uint32_t *const sram_copy = (uint32_t *)copy_buf;
    const uint32_t *const flash_cached = (const uint32_t *)(XIP_BASE + FLASH_TARGET_OFFSET);
    const uint32_t *const flash_nocache = (const uint32_t *)(XIP_NOCACHE_NOALLOC_BASE + FLASH_TARGET_OFFSET);
    uint32_t *const psram_cached = (uint32_t *)XIP_PSRAM_CACHED;
    uint32_t *const psram_nocache = (uint32_t *)XIP_PSRAM_NOCACHE;
    uint32_t *const psram_cached_dst = (uint32_t *)(XIP_PSRAM_CACHED + BUF_SIZE);
    uint32_t *const psram_nocache_dst = (uint32_t *)(XIP_PSRAM_NOCACHE + BUF_SIZE);

    // Fill an SRAM buffer with a random pattern
    uint32_t start_time = time_us_32();
    for (size_t i = 0; i < (BUF_SIZE / 4); i++) {
        ((uint32_t *)random_buf)[i] = get_rand_32();
    }
    uint32_t end_time = time_us_32();
    printf("SRAM buffer random filled in %u us\n", (unsigned)(end_time - start_time));

    // Flash random buffer and verify
    printf("Erasing random buffer in flash\n");
    uintptr_t flash_erase_params[] = {FLASH_TARGET_OFFSET, BUF_SIZE};
    int rc = flash_safe_execute(call_flash_range_erase, flash_erase_params, UINT32_MAX);
    if (rc < 0) {
        printf("Flash range erase failed: %d\n", rc);
        return -1;
    }
    printf("Programming random buffer to flash\n");
    uintptr_t flash_program_params[] = {FLASH_TARGET_OFFSET, (uintptr_t)random_buf, BUF_SIZE};
    rc = flash_safe_execute(call_flash_range_program, flash_program_params, UINT32_MAX);
    if (rc < 0) {
        printf("Flash range program failed: %d\n", rc);
        return -1;
    }
    printf("Verifying random flash buffer\n");
    // verify_buffer() takes a byte count; pass the full buffer size so the
    // whole programmed region is checked.
    if (!verify_buffer(reference, flash_cached, BUF_SIZE)) {
        printf("Flash buffer verification failed\n");
    }

    uint32_t copy_time;

    // Copy SRAM -> SRAM and verify
    clear_copy_buf();
    copy_time = time_copy_buffer(reference, sram_copy, BUF_SIZE);
    report_copy("SRAM -> SRAM copied", copy_time, sram_copy);

    // Copy Flash(cached) -> SRAM and verify (flash is the SOURCE here)
    clear_copy_buf();
    xip_cache_clean_all();
    copy_time = time_copy_buffer(flash_cached, sram_copy, BUF_SIZE);
    report_copy("Flash(cached) -> SRAM copied", copy_time, sram_copy);

    // Copy Flash(no cache) -> SRAM and verify
    clear_copy_buf();
    xip_cache_clean_all();
    copy_time = time_copy_buffer(flash_nocache, sram_copy, BUF_SIZE);
    report_copy("Flash(no cache) -> SRAM copied", copy_time, sram_copy);

    if (have_psram) {
        // Copy SRAM -> PSRAM(cached) and verify
        xip_cache_clean_all();
        copy_time = time_copy_buffer(reference, psram_cached, BUF_SIZE);
        report_copy("SRAM -> PSRAM(cached) copied", copy_time, psram_cached);

        // Copy SRAM -> PSRAM(no cache) and verify
        xip_cache_clean_all();
        copy_time = time_copy_buffer(reference, psram_nocache, BUF_SIZE);
        report_copy("SRAM -> PSRAM(no cache) copied", copy_time, psram_nocache);

        // Copy PSRAM(cached) -> SRAM and verify
        clear_copy_buf();
        load_psram_reference();
        copy_time = time_copy_buffer(psram_cached, sram_copy, BUF_SIZE);
        report_copy("PSRAM(cached) -> SRAM copied", copy_time, sram_copy);

        // Copy PSRAM(no cache) -> SRAM and verify
        clear_copy_buf();
        load_psram_reference();
        copy_time = time_copy_buffer(psram_nocache, sram_copy, BUF_SIZE);
        report_copy("PSRAM(no cache) -> SRAM copied", copy_time, sram_copy);

        // Copy PSRAM(cached) -> PSRAM(cached) and verify
        load_psram_reference();
        copy_time = time_copy_buffer(psram_cached, psram_cached_dst, BUF_SIZE);
        report_copy("PSRAM(cached) -> PSRAM(cached) copied", copy_time, psram_cached_dst);

        // Copy PSRAM(no cache) -> PSRAM(no cache) and verify
        load_psram_reference();
        copy_time = time_copy_buffer(psram_nocache, psram_nocache_dst, BUF_SIZE);
        report_copy("PSRAM(no cache) -> PSRAM(no cache) copied", copy_time, psram_nocache_dst);
    }

    // Copy Flash(no cache) -> SRAM via XIP stream and verify
    clear_copy_buf();
    xip_cache_clean_all();
    copy_time = time_xip_stream(flash_nocache, sram_copy, BUF_SIZE);
    report_copy("Flash(no cache) -> SRAM via XIP stream", copy_time, sram_copy);

    if (have_psram) {
        // Copy PSRAM(no cache) -> SRAM via XIP stream and verify
        clear_copy_buf();
        load_psram_reference();
        copy_time = time_xip_stream(psram_nocache, sram_copy, BUF_SIZE);
        report_copy("PSRAM(no cache) -> SRAM via XIP stream", copy_time, sram_copy);
    }

    printf("\n\n");
    sleep_ms(1000); // Allow time to flush output
    return 0;
}
@eightycc
Copy link
Author

Results run on Adafruit RP2350 Feather:

PSRAM size: 8388608 bytes
PSRAM read ID: 0d 5d 52 f3 da b0 66 33
Buffer size: 131072 bytes
SRAM buffer random filled in 85379 us
Erasing random buffer in flash
Programming random buffer to flash
Verifying random flash buffer
SRAM -> SRAM copied in 512 us, 15 ns/word
Flash(cached) -> SRAM copied in 6736 us, 205 ns/word
Flash(no cache) -> SRAM copied in 6938 us, 211 ns/word
SRAM -> PSRAM(cached) copied in 14438 us, 440 ns/word
SRAM -> PSRAM(no cache) copied in 4816 us, 146 ns/word
PSRAM(cached) -> SRAM copied in 5085 us, 155 ns/word
PSRAM(no cache) -> SRAM copied in 5309 us, 162 ns/word
PSRAM(cached) -> PSRAM(cached) copied in 29631 us, 904 ns/word
PSRAM(no cache) -> PSRAM(no cache) copied in 22102 us, 674 ns/word
Flash(no cache) -> SRAM via XIP stream in 5740 us, 175 ns/word
PSRAM(no cache) -> SRAM via XIP stream in 4100 us, 125 ns/word

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment