Created
March 15, 2025 18:20
-
-
Save eightycc/478033e458a2fd6b7da95a5c4c082679 to your computer and use it in GitHub Desktop.
Benchmark XIP Flash, PSRAM, and XIP Streaming
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include "pico/flash.h" | |
#include "pico/stdlib.h" | |
#include "pico/rand.h" | |
#include "pico/time.h" | |
#include "hardware/dma.h" | |
#include "hardware/flash.h" | |
#include "hardware/sync.h" | |
#include "hardware/xip_cache.h" | |
#include "hardware/regs/addressmap.h" | |
#include "hardware/regs/qmi.h" | |
#include "hardware/regs/xip.h" | |
#include "hardware/structs/xip_ctrl.h" | |
#include "hardware/structs/qmi.h" | |
#include "hardware/structs/xip_ctrl.h" | |
// Size in bytes of each benchmark buffer (must stay a multiple of 4: the
// buffers below are arrays of 32-bit words).
#define BUF_SIZE (128*1024)

// GPIO pin number used as the PSRAM chip select. Defaults to the
// Feather RP2350 wiring; define it on the compiler command line to override.
#ifndef PICO_PSRAM_CHIP_SELECT
#define PICO_PSRAM_CHIP_SELECT 8
#endif

// Addresses used by the benchmark for the cached and uncached PSRAM windows.
#define XIP_PSRAM_CACHED 0x11000000
#define XIP_PSRAM_NOCACHE 0x15000000

// Byte offset into flash of the scratch region that gets erased/programmed.
#define FLASH_TARGET_OFFSET (1024 * 1024) // +1MB should be safe to use

// SRAM buffers: the random reference pattern and the copy destination.
static volatile uint32_t random_buf[BUF_SIZE/4];
static volatile uint32_t copy_buf[BUF_SIZE/4];

// Filled in by setup_psram(): detected PSRAM size in bytes (0 = none) and
// the bytes captured from the PSRAM read-ID transaction.
static size_t _psram_size;
static uint8_t _psram_readid_response[8];
// Activate PSRAM. (Copied from CircuitPython ports/raspberrypi/supervisor/port.c) | |
static void __no_inline_not_in_flash_func(setup_psram)(void) { | |
gpio_set_function(PICO_PSRAM_CHIP_SELECT, GPIO_FUNC_XIP_CS1); | |
_psram_size = 0; | |
uint32_t save_irq_status = save_and_disable_interrupts(); | |
// Try and read the PSRAM ID via direct_csr. | |
qmi_hw->direct_csr = 30 << QMI_DIRECT_CSR_CLKDIV_LSB | | |
QMI_DIRECT_CSR_EN_BITS; | |
// Need to poll for the cooldown on the last XIP transfer to expire | |
// (via direct-mode BUSY flag) before it is safe to perform the first | |
// direct-mode operation | |
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_BUSY_BITS) != 0) { | |
} | |
// Exit out of QMI in case we've inited already | |
qmi_hw->direct_csr |= QMI_DIRECT_CSR_ASSERT_CS1N_BITS; | |
// Transmit as quad. | |
qmi_hw->direct_tx = QMI_DIRECT_TX_OE_BITS | | |
QMI_DIRECT_TX_IWIDTH_VALUE_Q << QMI_DIRECT_TX_IWIDTH_LSB | | |
0xf5; | |
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_BUSY_BITS) != 0) { | |
} | |
(void)qmi_hw->direct_rx; | |
qmi_hw->direct_csr &= ~(QMI_DIRECT_CSR_ASSERT_CS1N_BITS); | |
// Read the id | |
qmi_hw->direct_csr |= QMI_DIRECT_CSR_ASSERT_CS1N_BITS; | |
uint8_t kgd = 0; | |
uint8_t eid = 0; | |
for (size_t i = 0; i < 12; i++) { | |
if (i == 0) { | |
qmi_hw->direct_tx = 0x9f; | |
} else { | |
qmi_hw->direct_tx = 0xff; | |
} | |
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_TXEMPTY_BITS) == 0) { | |
} | |
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_BUSY_BITS) != 0) { | |
} | |
// buffer read id response eliding the first 4 bytes (cmd + 24-bit addr) | |
if (i >= 4) { | |
_psram_readid_response[i - 4] = qmi_hw->direct_rx; | |
} else { | |
(void)qmi_hw->direct_rx; | |
} | |
if (i == 5) { | |
kgd = _psram_readid_response[i-4]; | |
} else if (i == 6) { | |
eid = _psram_readid_response[i-4]; | |
} | |
} | |
// Disable direct csr. | |
qmi_hw->direct_csr &= ~(QMI_DIRECT_CSR_ASSERT_CS1N_BITS | QMI_DIRECT_CSR_EN_BITS); | |
if (kgd != 0x5D) { | |
restore_interrupts(save_irq_status); | |
return; | |
} | |
// Enable quad mode. | |
qmi_hw->direct_csr = 30 << QMI_DIRECT_CSR_CLKDIV_LSB | | |
QMI_DIRECT_CSR_EN_BITS; | |
// Need to poll for the cooldown on the last XIP transfer to expire | |
// (via direct-mode BUSY flag) before it is safe to perform the first | |
// direct-mode operation | |
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_BUSY_BITS) != 0) { | |
} | |
// RESETEN, RESET and quad enable | |
for (uint8_t i = 0; i < 3; i++) { | |
qmi_hw->direct_csr |= QMI_DIRECT_CSR_ASSERT_CS1N_BITS; | |
if (i == 0) { | |
qmi_hw->direct_tx = 0x66; | |
} else if (i == 1) { | |
qmi_hw->direct_tx = 0x99; | |
} else { | |
qmi_hw->direct_tx = 0x35; | |
} | |
while ((qmi_hw->direct_csr & QMI_DIRECT_CSR_BUSY_BITS) != 0) { | |
} | |
qmi_hw->direct_csr &= ~(QMI_DIRECT_CSR_ASSERT_CS1N_BITS); | |
for (size_t j = 0; j < 20; j++) { | |
asm ("nop"); | |
} | |
(void)qmi_hw->direct_rx; | |
} | |
// Disable direct csr. | |
qmi_hw->direct_csr &= ~(QMI_DIRECT_CSR_ASSERT_CS1N_BITS | QMI_DIRECT_CSR_EN_BITS); | |
// PSRAM timings with 150MHz SCK, 6.667ns per cycle. | |
// COOLDOWN: 2'b01 64 SCK = 426.7ns | |
// PAGEBREAK: 2'b10 break bursts at 1024-byte page boundaries | |
// reserved: 2'b00 | |
// SELECT_SETUP: 1'b0 0.5 SCK = 3.33ns | |
// SELECT_HOLD: 2'b00 1.0 SCK = 6.67ns | |
// MAX_SELECT: 6'b01_0000 16 x 64 SCK = 6.827us | |
// MIN_DESELECT: 4'b0111 7.5 x SCK = 50.0ns | |
// reserved: 1'b0 | |
// RXDELAY: 3'b001 0.5 SCK = 3.33ns | |
// CLKDIV: 8'b0000_0010 0.5 SCK = 3.33ns = 75MHz | |
qmi_hw->m[1].timing = | |
QMI_M0_TIMING_PAGEBREAK_VALUE_1024 << QMI_M0_TIMING_PAGEBREAK_LSB | // Break between pages. | |
3 << QMI_M0_TIMING_SELECT_HOLD_LSB | // Delay releasing CS for 3 extra system cycles. | |
1 << QMI_M0_TIMING_COOLDOWN_LSB | | |
1 << QMI_M0_TIMING_RXDELAY_LSB | | |
16 << QMI_M0_TIMING_MAX_SELECT_LSB | // In units of 64 system clock cycles. PSRAM says 8us max. 8 / 0.00752 / 64 = 16.62 | |
7 << QMI_M0_TIMING_MIN_DESELECT_LSB | // In units of system clock cycles. PSRAM says 50ns.50 / 7.52 = 6.64 | |
2 << QMI_M0_TIMING_CLKDIV_LSB; | |
qmi_hw->m[1].rfmt = (QMI_M0_RFMT_PREFIX_WIDTH_VALUE_Q << QMI_M0_RFMT_PREFIX_WIDTH_LSB | | |
QMI_M0_RFMT_ADDR_WIDTH_VALUE_Q << QMI_M0_RFMT_ADDR_WIDTH_LSB | | |
QMI_M0_RFMT_SUFFIX_WIDTH_VALUE_Q << QMI_M0_RFMT_SUFFIX_WIDTH_LSB | | |
QMI_M0_RFMT_DUMMY_WIDTH_VALUE_Q << QMI_M0_RFMT_DUMMY_WIDTH_LSB | | |
QMI_M0_RFMT_DUMMY_LEN_VALUE_24 << QMI_M0_RFMT_DUMMY_LEN_LSB | | |
QMI_M0_RFMT_DATA_WIDTH_VALUE_Q << QMI_M0_RFMT_DATA_WIDTH_LSB | | |
QMI_M0_RFMT_PREFIX_LEN_VALUE_8 << QMI_M0_RFMT_PREFIX_LEN_LSB | | |
QMI_M0_RFMT_SUFFIX_LEN_VALUE_NONE << QMI_M0_RFMT_SUFFIX_LEN_LSB); | |
qmi_hw->m[1].rcmd = 0xeb << QMI_M0_RCMD_PREFIX_LSB | | |
0 << QMI_M0_RCMD_SUFFIX_LSB; | |
qmi_hw->m[1].wfmt = (QMI_M0_WFMT_PREFIX_WIDTH_VALUE_Q << QMI_M0_WFMT_PREFIX_WIDTH_LSB | | |
QMI_M0_WFMT_ADDR_WIDTH_VALUE_Q << QMI_M0_WFMT_ADDR_WIDTH_LSB | | |
QMI_M0_WFMT_SUFFIX_WIDTH_VALUE_Q << QMI_M0_WFMT_SUFFIX_WIDTH_LSB | | |
QMI_M0_WFMT_DUMMY_WIDTH_VALUE_Q << QMI_M0_WFMT_DUMMY_WIDTH_LSB | | |
QMI_M0_WFMT_DUMMY_LEN_VALUE_NONE << QMI_M0_WFMT_DUMMY_LEN_LSB | | |
QMI_M0_WFMT_DATA_WIDTH_VALUE_Q << QMI_M0_WFMT_DATA_WIDTH_LSB | | |
QMI_M0_WFMT_PREFIX_LEN_VALUE_8 << QMI_M0_WFMT_PREFIX_LEN_LSB | | |
QMI_M0_WFMT_SUFFIX_LEN_VALUE_NONE << QMI_M0_WFMT_SUFFIX_LEN_LSB); | |
qmi_hw->m[1].wcmd = 0x38 << QMI_M0_WCMD_PREFIX_LSB | | |
0 << QMI_M0_WCMD_SUFFIX_LSB; | |
restore_interrupts(save_irq_status); | |
_psram_size = 1024 * 1024; // 1 MiB | |
uint8_t size_id = eid >> 5; | |
if (eid == 0x26 || size_id == 2) { | |
_psram_size *= 8; | |
} else if (size_id == 0) { | |
_psram_size *= 2; | |
} else if (size_id == 1) { | |
_psram_size *= 4; | |
} | |
// Mark that we can write to PSRAM. | |
xip_ctrl_hw->ctrl |= XIP_CTRL_WRITABLE_M1_BITS; | |
// Test write to the PSRAM. | |
volatile uint32_t *psram_nocache = (volatile uint32_t *)XIP_PSRAM_NOCACHE; | |
psram_nocache[0] = 0x12345678; | |
volatile uint32_t readback = psram_nocache[0]; | |
if (readback != 0x12345678) { | |
_psram_size = 0; | |
return; | |
} | |
} | |
// Flash erase and program function trampolines

// Erase trampoline: `param` points at two uintptr_t values,
// { flash offset, length in bytes }, which are forwarded to
// flash_range_erase().
static void call_flash_range_erase(void *param) {
    const uintptr_t *args = (const uintptr_t *)param;
    const uint32_t flash_offs = (uint32_t)args[0];
    const size_t count = (size_t)args[1];

    flash_range_erase(flash_offs, count);
}
// Program trampoline: `param` points at three uintptr_t values,
// { flash offset, source data address, length in bytes }, which are
// forwarded to flash_range_program().
static void call_flash_range_program(void *param) {
    const uintptr_t *args = (const uintptr_t *)param;
    const uint32_t flash_offs = (uint32_t)args[0];
    const uint8_t *payload = (const uint8_t *)args[1];
    const size_t count = (size_t)args[2];

    flash_range_program(flash_offs, payload, count);
}
// Compare two word buffers.
//
//   src, dst - buffers to compare, read as 32-bit words
//   len      - length to compare in BYTES (len / 4 words are checked)
//
// Returns true when every compared word matches. On the first mismatch the
// word index and both values are printed and false is returned.
static bool verify_buffer(const uint32_t *src, const uint32_t *dst, size_t len) {
    const size_t words = len / sizeof(uint32_t);

    for (size_t i = 0; i < words; i++) {
        if (src[i] != dst[i]) {
            // %zu / %lx with explicit casts: size_t and uint32_t do not match
            // %d / %x on every target.
            printf("Buffer mismatch at index %zu: %08lx != %08lx\n",
                   i, (unsigned long)src[i], (unsigned long)dst[i]);
            return false;
        }
    }
    return true;
}
// Copy `len` BYTES (len / 4 32-bit words) from src to dst one word at a
// time and return the elapsed time in microseconds as measured with
// time_us_32().
//
// NOTE(review): the pointers are not volatile, so an optimising compiler may
// be free to replace this loop with a library copy - confirm that matches
// what the benchmark is meant to measure.
static uint32_t time_copy_buffer(const uint32_t *src, uint32_t *dst, size_t len) {
    const size_t words = len / sizeof(uint32_t);

    const uint32_t start_time = time_us_32();
    // size_t index: avoids the signed/unsigned comparison against the bound.
    for (size_t i = 0; i < words; i++) {
        dst[i] = src[i];
    }
    const uint32_t end_time = time_us_32();

    return end_time - start_time;
}
static uint32_t time_xip_stream(const uint32_t *src, uint32_t *dst, size_t len) { | |
// Set up XIP to stream data from flash or PSRAM to SRAM via DMA channel 0 | |
while (!(xip_ctrl_hw->stat & XIP_STAT_FIFO_EMPTY)) | |
(void) xip_ctrl_hw->stream_fifo; | |
xip_ctrl_hw->stream_addr = (uint32_t)src; | |
// NOTE: stream_ctr counts 32-bit words | |
xip_ctrl_hw->stream_ctr = len / 4; | |
// Configure DMA channel 0 to stream from XIP | |
const uint dma_chan = 0; | |
dma_channel_config cfg = dma_channel_get_default_config(dma_chan); | |
channel_config_set_read_increment(&cfg, false); | |
channel_config_set_write_increment(&cfg, true); | |
channel_config_set_dreq(&cfg, DREQ_XIP_STREAM); | |
uint32_t start_time = time_us_32(); | |
dma_channel_configure( | |
dma_chan, | |
&cfg, | |
(void *) dst, // Write addr | |
(const void *) XIP_AUX_BASE, // Read addr | |
len / 4, // Transfer count | |
true // Start immediately! | |
); | |
//printf("DMA channel %d started\n", dma_chan); | |
dma_channel_wait_for_finish_blocking(dma_chan); | |
uint32_t end_time = time_us_32(); | |
return end_time - start_time; | |
} | |
int main() | |
{ | |
stdio_init_all(); | |
sleep_ms(1000); // Allow time to start USB terminal | |
setup_psram(); | |
printf("PSRAM size: %d bytes\n", _psram_size); | |
printf("PSRAM read ID: %02x %02x %02x %02x %02x %02x %02x %02x\n", | |
_psram_readid_response[0], _psram_readid_response[1], _psram_readid_response[2], _psram_readid_response[3], | |
_psram_readid_response[4], _psram_readid_response[5], _psram_readid_response[6], _psram_readid_response[7]); | |
printf("Buffer size: %d bytes\n", BUF_SIZE); | |
// Fill an SRAM buffer with a random pattern | |
uint32_t start_time = time_us_32(); | |
for (int i = 0; i < (BUF_SIZE / 4); i++) { | |
((uint32_t *)random_buf)[i] = get_rand_32(); | |
} | |
uint32_t end_time = time_us_32(); | |
printf("SRAM buffer random filled in %d us\n", end_time - start_time); | |
// Flash random buffer and verify | |
printf("Erasing random buffer in flash\n"); | |
uintptr_t flash_erase_params[] = {FLASH_TARGET_OFFSET, BUF_SIZE}; | |
int rc = flash_safe_execute(call_flash_range_erase, flash_erase_params, UINT32_MAX); | |
if (rc < 0) { | |
printf("Flash range erase failed: %d\n", rc); | |
return -1; | |
} | |
printf("Programming random buffer to flash\n"); | |
uintptr_t flash_program_params[] = {FLASH_TARGET_OFFSET, (uintptr_t)random_buf, BUF_SIZE}; | |
rc = flash_safe_execute(call_flash_range_program, flash_program_params, UINT32_MAX); | |
if (rc < 0) { | |
printf("Flash range program failed: %d\n", rc); | |
return -1; | |
} | |
printf("Verifying random flash buffer\n"); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)(XIP_BASE + FLASH_TARGET_OFFSET), BUF_SIZE / 4)) { | |
printf("Flash buffer verification failed\n"); | |
} | |
// Copy SRAM -> SRAM and verify | |
uint32_t copy_time = time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)copy_buf, BUF_SIZE); | |
printf("SRAM -> SRAM copied in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)copy_buf, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
// Copy Flash(cached) -> SRAM and verify | |
xip_cache_clean_all(); | |
copy_time = time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)XIP_BASE + FLASH_TARGET_OFFSET, BUF_SIZE); | |
printf("Flash(cached) -> SRAM copied in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)copy_buf, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
// Copy Flash(no cache) -> SRAM and verify | |
xip_cache_clean_all(); | |
copy_time = time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)XIP_NOCACHE_NOALLOC_BASE + FLASH_TARGET_OFFSET, BUF_SIZE); | |
printf("Flash(no cache) -> SRAM copied in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)copy_buf, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
// Copy SRAM -> PSRAM(cached) and verify | |
xip_cache_clean_all(); | |
copy_time = time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)XIP_PSRAM_CACHED, BUF_SIZE); | |
printf("SRAM -> PSRAM(cached) copied in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)XIP_PSRAM_CACHED, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
// Copy SRAM -> PSRAM(no cache) and verify | |
xip_cache_clean_all(); | |
copy_time = time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)XIP_PSRAM_NOCACHE, BUF_SIZE); | |
printf("SRAM -> PSRAM(no cache) copied in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)XIP_PSRAM_NOCACHE, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
// Copy PSRAM(cached) -> SRAM and verify | |
xip_cache_clean_all(); | |
time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)XIP_PSRAM_NOCACHE, BUF_SIZE); | |
xip_cache_clean_all(); | |
copy_time = time_copy_buffer((const uint32_t *)XIP_PSRAM_CACHED, (uint32_t *)copy_buf, BUF_SIZE); | |
printf("PSRAM(cached) -> SRAM copied in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)copy_buf, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
// Copy PSRAM(no cache) -> SRAM and verify | |
xip_cache_clean_all(); | |
time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)XIP_PSRAM_NOCACHE, BUF_SIZE); | |
xip_cache_clean_all(); | |
copy_time = time_copy_buffer((const uint32_t *)XIP_PSRAM_NOCACHE, (uint32_t *)copy_buf, BUF_SIZE); | |
printf("PSRAM(no cache) -> SRAM copied in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)copy_buf, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
// Copy PSRAM(cached) -> PSRAM(cached) and verify | |
xip_cache_clean_all(); | |
time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)XIP_PSRAM_NOCACHE, BUF_SIZE); | |
xip_cache_clean_all(); | |
copy_time = time_copy_buffer((const uint32_t *)XIP_PSRAM_CACHED, (uint32_t *)XIP_PSRAM_CACHED+BUF_SIZE, BUF_SIZE); | |
printf("PSRAM(cached) -> PSRAM(cached) copied in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)XIP_PSRAM_CACHED+BUF_SIZE, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
// Copy PSRAM(no cache) -> PSRAM(no cache) and verify | |
xip_cache_clean_all(); | |
time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)XIP_PSRAM_NOCACHE, BUF_SIZE); | |
xip_cache_clean_all(); | |
copy_time = time_copy_buffer((const uint32_t *)XIP_PSRAM_NOCACHE, (uint32_t *)XIP_PSRAM_NOCACHE+BUF_SIZE, BUF_SIZE); | |
printf("PSRAM(no cache) -> PSRAM(no cache) copied in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)XIP_PSRAM_NOCACHE+BUF_SIZE, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
// Copy Flash(no cache) -> SRAM via XIP stream and verify | |
xip_cache_clean_all(); | |
copy_time = time_xip_stream((const uint32_t *)(XIP_NOCACHE_NOALLOC_BASE + FLASH_TARGET_OFFSET), (uint32_t *)copy_buf, BUF_SIZE); | |
printf("Flash(no cache) -> SRAM via XIP stream in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)copy_buf, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
// Copy PSRAM(no cache) -> SRAM via XIP stream and verify | |
xip_cache_clean_all(); | |
time_copy_buffer((const uint32_t *)random_buf, (uint32_t *)XIP_PSRAM_NOCACHE, BUF_SIZE); | |
xip_cache_clean_all(); | |
copy_time = time_xip_stream((const uint32_t *)(XIP_PSRAM_NOCACHE), (uint32_t *)copy_buf, BUF_SIZE); | |
printf("PSRAM(no cache) -> SRAM via XIP stream in %d us, %d ns/word\n", copy_time, (copy_time * 1000) / (BUF_SIZE / 4)); | |
if (!verify_buffer((const uint32_t *)random_buf, (const uint32_t *)copy_buf, BUF_SIZE)) { | |
printf("Buffer verification failed\n"); | |
} | |
printf("\n\n"); | |
sleep_ms(1000); // Allow time to flush output | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Results run on Adafruit RP2350 Feather: