DanielTimLee/code_tracing.md

코드 분석

RPS 패치 내용: https://github.com/torvalds/linux/commit/0a9627f2649a02bea165cfd529d7bcb625c2fcad

RFS 패치 내용: https://github.com/torvalds/linux/commit/fec5e652e58fa6017b2c9e06466cb2a6538de5b4

1. sock.h

/*
 * sock_rps_record_flow_hash - record a flow hash in the global socket flow table.
 * @hash: flow hash to record
 *
 * Takes a hash value as an argument and sets the sock_flow_table accordingly.
 * This allows the table to be populated in cases where flow is being tracked
 * outside of a sock structure.
 *
 * Compiles to an empty function when CONFIG_RPS is not set.
 *
 * ref. https://github.com/torvalds/linux/commit/fe47755852d1f299b55a6e6594bb6e082ac103d4
 */
static inline void sock_rps_record_flow_hash(__u32 hash)
{
#ifdef CONFIG_RPS
    struct rps_sock_flow_table *table;

    /* The global table pointer is fetched with rcu_dereference(), so the
     * lookup and the update both stay inside the RCU read-side section.
     */
    rcu_read_lock();
    table = rcu_dereference(rps_sock_flow_table);
    rps_record_sock_flow(table, hash);    /* see netdevice.h */
    rcu_read_unlock();
#endif
}

/*
 * sock_rps_record_flow - record the receive hash of an established socket.
 * @sk: socket whose sk_rxhash is handed to sock_rps_record_flow_hash()
 *
 * Nothing is recorded unless the rfs_needed static key is enabled and the
 * socket is in TCP_ESTABLISHED state. Compiles to an empty function when
 * CONFIG_RPS is not set.
 */
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
    /* static_key_false() takes a static_key and yields true when
     * static_key_count() of that key is greater than zero, false otherwise.
     * Here it answers "is RFS needed at all?"; if not, stop right away.
     */
    if (!static_key_false(&rfs_needed))
        return;

    /* Reading sk->sk_rxhash might incur an expensive cache line
     * miss.
     *
     * TCP_ESTABLISHED does cover almost all states where RFS
     * might be useful, and is cheaper [1] than testing :
     *    IPv4: inet_sk(sk)->inet_daddr
     *    IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
     * OR    an additional socket flag
     * [1] : sk_state and sk_prot are in the same cache line.
     */
    if (sk->sk_state != TCP_ESTABLISHED)
        return;

    sock_rps_record_flow_hash(sk->sk_rxhash);
#endif
}

/*
 * sock_rps_save_rxhash - remember a packet's flow hash in the socket.
 * @sk:  socket whose sk_rxhash may be updated
 * @skb: buffer supplying the hash
 *
 * Compiles to an empty function when CONFIG_RPS is not set.
 */
static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_RPS
    /* Store only when the two values differ; a mismatch is hinted as the unlikely case. */
    if (unlikely(sk->sk_rxhash != skb->hash))
        sk->sk_rxhash = skb->hash;    /* bring sk->sk_rxhash in line with skb->hash */
#endif
}

static inline void sock_rps_reset_rxhash(struct sock *sk)
{
#ifdef CONFIG_RPS
    // sk_rxhash reset..
    sk->sk_rxhash = 0;
#endif
}

...

2. netdevice.h

...

#ifdef CONFIG_RPS
/* Static keys gating the RPS and RFS code paths; both are tested with
 * static_key_false() in the excerpts of these notes.
 */

#include <linux/static_key.h>
extern struct static_key rps_needed;
extern struct static_key rfs_needed;
#endif

...

#ifdef CONFIG_RPS
/*
 * This structure holds an RPS map which can be of variable length.
 * The map is an array of CPUs.
 *
 * @len:  number of valid entries in @cpus
 * @rcu:  RCU head; the map is freed through kfree_rcu(map, rcu)
 * @cpus: trailing variable-length array of CPU ids
 */
struct rps_map {
    unsigned int len;
    struct rcu_head rcu;
    u16 cpus[];    /* C99 flexible array member instead of the GNU [0] extension */
};
/* Allocation size in bytes of an rps_map holding _num CPU entries. */
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))

/*
 * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
 * tail pointer for that CPU's input queue at the time of last enqueue, and
 * a hardware filter index.
 *
 * In short: one entry of a flow-to-CPU mapping table.
 */
struct rps_dev_flow {
    u16 cpu;    /* CPU the flow is mapped to */
    u16 filter;    /* hardware filter index */
    unsigned int last_qtail;    /* tail pointer of that CPU's input queue at the time of the last enqueue */
};
#define RPS_NO_FILTER 0xffff    /* stored in ->filter by set_rps_cpu() to invalidate an old filter */

/*
 * The rps_dev_flow_table structure contains a table of flow mappings.
 *
 * @mask:  number of entries minus one; a flow hash is ANDed with it to pick
 *         an entry in @flows
 * @rcu:   RCU head used to free the table via call_rcu()
 * @flows: trailing variable-length array of flow mappings
 */
struct rps_dev_flow_table {
    unsigned int mask;
    struct rcu_head rcu;
    struct rps_dev_flow flows[];    /* C99 flexible array member instead of the GNU [0] extension */
};
/* Allocation size in bytes of an rps_dev_flow_table holding _num flow entries. */
#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
    ((_num) * sizeof(struct rps_dev_flow)))

/*
 * The rps_sock_flow_table contains mappings of flows to the last CPU
 * on which they were processed by the application (set in recvmsg).
 * Each entry is a 32bit value. Upper part is the high-order bits
 * of flow hash, lower part is CPU number.
 * rps_cpu_mask is used to partition the space, depending on number of
 * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
 * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
 * meaning we use 32-6=26 bits for the hash.
 *
 * @mask: number of entries minus one; a flow hash is ANDed with it to pick
 *        an entry in @ents
 * @ents: trailing variable-length array of (hash bits | CPU) values
 */
struct rps_sock_flow_table {
    u32    mask;

    /* C99 flexible array member instead of the GNU [0] extension */
    u32    ents[] ____cacheline_aligned_in_smp;
};
/* Allocation size in bytes of an rps_sock_flow_table holding _num entries. */
#define    RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))

#define RPS_NO_CPU 0xffff    /* 16 bits, all ones: 1111 1111 1111 1111 */

/* Globals shared by the RPS/RFS excerpts: the CPU-number mask applied to table
 * entries, and the RCU-protected pointer to the global socket flow table.
 */
extern u32 rps_cpu_mask;
extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;

/*
 * rps_record_sock_flow - note in @table that @hash was last seen on this CPU.
 * @table: socket flow table to update; may be NULL
 * @hash:  flow hash; 0 is ignored
 *
 * The entry selected by (hash & table->mask) receives the hash bits outside
 * rps_cpu_mask, ORed with the id of the CPU running this code.
 */
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, u32 hash)
{
    unsigned int slot;
    u32 entry;

    /* Nothing to record without a table or without a hash. */
    if (!table || !hash)
        return;

    slot = hash & table->mask;

    /* We only give a hint, preemption can change CPU under us, hence
     * raw_smp_processor_id().
     *
     * NOTE(review): smp_processor_id() is said to be defined in smp.h on top
     * of raw_smp_processor_id(); see
     * https://www.spinics.net/lists/arm-kernel/msg72896.html - confirm.
     */
    entry = (hash & ~rps_cpu_mask) | raw_smp_processor_id();

    /* Write the entry only when its value actually changes. */
    if (table->ents[slot] != entry)
        table->ents[slot] = entry;
}

#ifdef CONFIG_RFS_ACCEL
/* Returns true when the hardware filter identified by (rxq_index, flow_id,
 * filter_id) on dev may be removed; the definition is in the dev.c excerpt.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
             u16 filter_id);
#endif
#endif /* CONFIG_RPS */

...

/*
 * This structure contains an instance of an RX queue.
 *
 * With CONFIG_RPS set, each RX queue carries its own RPS map and its own
 * RPS device flow table.
 */
struct netdev_rx_queue {
#ifdef CONFIG_RPS
    struct rps_map __rcu        *rps_map;    /* RPS map of this queue (RCU-protected pointer) */
    struct rps_dev_flow_table __rcu    *rps_flow_table;    /* RPS device flow table of this queue (RCU-protected pointer) */
#endif
    struct kobject            kobj;    /* embedded kobject; rx_queue_release() gets the queue back from it via to_rx_queue() */
    struct net_device        *dev;    /* owning device; rx_queue_release() drops a reference with dev_put() */
    struct xdp_rxq_info        xdp_rxq;
} ____cacheline_aligned_in_smp;

...

/*
 * Per-CPU receive state: incoming packets are placed on per-CPU queues
 * (input_pkt_queue), and the RPS members let one CPU hand work to another.
 */
struct softnet_data {
    struct list_head    poll_list;
    struct sk_buff_head    process_queue;

    /* stats */
    unsigned int        processed;
    unsigned int        time_squeeze;
    unsigned int        received_rps;    /* incremented by rps_trigger_softirq() */
#ifdef CONFIG_RPS
    /*
     * Inter Processor Interrupt (IPI): an interrupt that one CPU raises on
     * another CPU of an SMP system, mainly used for CPU-to-CPU communication.
     *
     * rps_ipi_list is the head of a singly linked list of softnet_data
     * structures belonging to other CPUs, chained through rps_ipi_next;
     * rps_ipi_queued() pushes entries onto it. That is why the element type
     * is softnet_data itself.
     */
    struct softnet_data    *rps_ipi_list;
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
    struct sd_flow_limit __rcu *flow_limit;
#endif
    struct Qdisc        *output_queue;
    struct Qdisc        **output_queue_tailp;
    struct sk_buff        *completion_queue;
#ifdef CONFIG_XFRM_OFFLOAD
    struct sk_buff_head    xfrm_backlog;
#endif
#ifdef CONFIG_RPS
    /* input_queue_head should be written by cpu owning this struct,
     * and only read by other cpus. Worth using a cache line.
     */
    unsigned int        input_queue_head ____cacheline_aligned_in_smp;

    /*
     * Elements below can be accessed between CPUs for RPS/RFS
     *
     * call_single_data_t is IPI-related: it is used to pass information
     * between CPUs.
     * ref. https://github.com/torvalds/linux/commit/966a967116e699762dbf4af7f9e0d1955c25aa37
     */
    call_single_data_t    csd ____cacheline_aligned_in_smp;
    struct softnet_data    *rps_ipi_next;    /* next node of another CPU's rps_ipi_list (set in rps_ipi_queued()) */
    unsigned int        cpu;
    unsigned int        input_queue_tail;    /* tail counterpart of input_queue_head; NOTE(review): its users are not part of this excerpt */
#endif
    unsigned int        dropped;
    struct sk_buff_head    input_pkt_queue;    /* its spinlock is what rps_lock()/rps_unlock() take */
    struct napi_struct    backlog;    /* scheduled by rps_trigger_softirq() */

};

...

3. tun.c

/* Since the socket were moved to tun_file, to preserve the behavior of persist
 * device, socket filter, sndbuf and vnet header size were restore when the
 * file were attached to a persist device.
 *
 * NOTE(review): "persist device" presumably means a tun/tap device that is
 * kept alive after its file descriptor is closed - confirm against tun.c.
 *
 * Only the members used by the excerpt below are shown; the rest is elided.
 */
struct tun_struct {
    struct tun_file __rcu    *tfiles[MAX_TAP_QUEUES];
    unsigned int            numqueues;    /* tun_automq_xmit() only acts when this is 1 */
    unsigned int         flags;
    ...
}

...

/*
 * tun_automq_xmit - save the RPS hash of a transmitted skb for a single-queue
 * tun device.
 * @tun: tun device the skb is sent on
 * @skb: buffer being transmitted
 *
 * (Labelled "Net device start xmit" in the original excerpt.) Compiles to an
 * empty function when CONFIG_RPS is not set.
 */
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
    struct tun_flow_entry *entry;
    __u32 hash;

    /* static_key_false(struct static_key *key) yields true when
     * static_key_count(key) is greater than zero, false otherwise.
     *
     * A static key patches, at run time, the branch instruction leading to a
     * routine that should only run under a certain condition (for example
     * only while monitoring is switched on), which slightly lowers the chance
     * of a branch miss.
     * https://wariua.github.io/facility/labels-and-static-key.html
     * http://jake.dothome.co.kr/static-keys/
     *
     * ref. https://elixir.bootlin.com/linux/latest/source/Documentation/static-keys.txt
     * kor. https://wariua.cafe24.com/wiki/Documentation/static-keys.txt
     */
    if (tun->numqueues != 1 || !static_key_false(&rps_needed))
        return;

    /* Select queue was not called for the skbuff, so we extract the
     * RPS hash and save it into the flow_table here.
     *
     * NOTE(review): "select queue" presumably refers to the device's
     * queue-selection callback - confirm against tun.c.
     */
    hash = __skb_get_hash_symmetric(skb);    /* flow hash computed from the flow keys */
    if (!hash)
        return;

    /* tun_flow_find(struct hlist_head *head, u32 rxhash) walks the list and
     * returns the entry carrying this rxhash, if there is one.
     */
    entry = tun_flow_find(&tun->flows[tun_hashfn(hash)], hash);

    /* Save the hash received in the stack receive path and update the
     * flow_hash table accordingly: per the original notes the helper sets
     * entry->rps_rxhash = rxhash when the two differ.
     */
    if (entry)
        tun_flow_save_rps_rxhash(entry, hash);
#endif
}

4. sysctl_net_core.c

...

/*
 * rps_sock_flow_sysctl - sysctl handler that reads or resizes the global
 * rps_sock_flow_table (this code was introduced together with the RFS patch).
 *
 * On read, the current number of entries (mask + 1, or 0 without a table) is
 * reported through proc_dointvec(). On write, the requested size is rounded
 * up to a power of two; a table of a different size is allocated and
 * published, every entry of the table in use is reset to RPS_NO_CPU, and
 * size 0 removes the table.
 *
 * Returns the proc_dointvec() result, -EINVAL when the requested size exceeds
 * 1<<29, or -ENOMEM when the new table cannot be allocated.
 */
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos)
{
    unsigned int orig_size, size;
    int ret, i;
    /* Temporary ctl_table that makes proc_dointvec() read/write the local `size`. */
    struct ctl_table tmp = {
        .data = &size,
        .maxlen = sizeof(size),
        .mode = table->mode
    };

    /*
     * rps_sock_flow_table is the one global table; as documented in
     * netdevice.h, it maps each flow to the CPU on which the application
     * last processed it (the "desired" CPU).
     */
    struct rps_sock_flow_table *orig_sock_table, *sock_table;
    static DEFINE_MUTEX(sock_flow_mutex);

    mutex_lock(&sock_flow_mutex);

    orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
                    lockdep_is_held(&sock_flow_mutex));
    size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;

    ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);    /* proc_dointvec(): read a vector of integers */

    if (write) {
        if (size) {
            if (size > 1<<29) {
                /* Enforce limit to prevent overflow */
                mutex_unlock(&sock_flow_mutex);
                return -EINVAL;
            }

            size = roundup_pow_of_two(size);    /* entry count of rps_sock_flow_table */

            if (size != orig_size) {
                /* Size changed: allocate a fresh table. */
                sock_table = vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));

                if (!sock_table) {
                    /* allocation failed */
                    mutex_unlock(&sock_flow_mutex);
                    return -ENOMEM;
                }

                /* Compute rps_cpu_mask (see scaling.txt - RFS - Suggested
                 * Configuration). It partitions each entry into hash bits
                 * and CPU bits and depends on the number of possible CPUs.
                 */
                rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
                sock_table->mask = size - 1;
            } else
                sock_table = orig_sock_table;

            /* Reset every entry of the table that will be in use. */
            for (i = 0; i < size; i++)
                sock_table->ents[i] = RPS_NO_CPU;
        } else
            sock_table = NULL;

        /* The table was replaced or removed. */
        if (sock_table != orig_sock_table) {
            /* Publish the new pointer (possibly NULL) to RCU readers. */
            rcu_assign_pointer(rps_sock_flow_table, sock_table);

            /* A new table takes one count on each static key ... */
            if (sock_table) {
                static_key_slow_inc(&rps_needed);
                static_key_slow_inc(&rfs_needed);
            }

            /* ... and the old table gives its counts back. It is freed only
             * after synchronize_rcu(), i.e. after the new pointer has been
             * published.
             */
            if (orig_sock_table) {
                static_key_slow_dec(&rps_needed);
                static_key_slow_dec(&rfs_needed);
                synchronize_rcu();
                vfree(orig_sock_table);
            }

            /* So this is not a "sync" between the two tables: the old one is
             * retired and the new one takes its place.
             */
        }
    }

    mutex_unlock(&sock_flow_mutex);

    return ret;
}

...

5. net-sysfs.c

...

/*
 * show_rps_map - print the CPUs of an RX queue's RPS map into @buf.
 * @queue: RX queue whose rps_map is shown
 * @buf:   output buffer, written with at most PAGE_SIZE bytes
 *
 * The CPUs listed in the map are collected into a temporary cpumask, which is
 * then formatted with "%*pb". A queue without a map yields an empty mask.
 *
 * Returns the number of characters written, -ENOMEM when the temporary
 * cpumask cannot be allocated, or -EINVAL when the text does not fit.
 */
static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
{
    struct rps_map *map;
    cpumask_var_t cpus;
    int idx, written;

    /* Zero-initialised scratch mask. */
    if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
        return -ENOMEM;

    rcu_read_lock();
    map = rcu_dereference(queue->rps_map);

    /* Set one bit per CPU of the map; the loop body never runs without a map. */
    for (idx = 0; map && idx < map->len; idx++)
        cpumask_set_cpu(map->cpus[idx], cpus);

    /* Format the whole mask; snprintf() returns the length of that text. */
    written = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(cpus));
    rcu_read_unlock();

    free_cpumask_var(cpus);

    /* A length of PAGE_SIZE or more means the output was truncated. */
    if (written >= PAGE_SIZE)
        return -EINVAL;

    return written;
}

/*
 * store_rps_map - replace the RPS map of an RX queue from a hex CPU bitmap.
 * @queue: RX queue whose rps_map is replaced
 * @buf:   ASCII hex bitmap of the CPUs to use
 * @len:   length of @buf
 *
 * Builds a new map holding the CPUs that are both requested and online,
 * publishes it in queue->rps_map, adjusts the rps_needed static key and frees
 * the previous map via kfree_rcu(). A bitmap that selects no online CPU
 * removes the map.
 *
 * Returns @len on success, -EPERM without CAP_NET_ADMIN, -ENOMEM on
 * allocation failure, or the bitmap_parse() error.
 */
static ssize_t store_rps_map(struct netdev_rx_queue *queue, const char *buf, size_t len)
{
    struct rps_map *old_map, *map;    /* map currently installed / map being built */
    cpumask_var_t mask;
    int err, cpu, i;
    static DEFINE_MUTEX(rps_map_mutex);


    /* capable() reports whether the current task holds the given capability;
     * callers without CAP_NET_ADMIN are rejected.
     */
    if (!capable(CAP_NET_ADMIN))
        return -EPERM;

    if (!alloc_cpumask_var(&mask, GFP_KERNEL))
        return -ENOMEM;

    err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);    /* convert an ASCII hex string into a bitmap */
    /* Parsing failed: release the mask and report the error. */
    if (err) {
        free_cpumask_var(mask);    /* release the cpumask */
        return err;
    }

    /* Allocate a zeroed map with room for every CPU set in the mask, but
     * never less than L1_CACHE_BYTES.
     */
    map = kzalloc(
        max_t(unsigned int, RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
        GFP_KERNEL
    );

    /* Allocation failed. */
    if (!map) {
        free_cpumask_var(mask);
        return -ENOMEM;
    }

    i = 0;
    /* for_each_cpu_and() itself assigns `cpu`: it iterates over the CPUs that
     * are set in both mask and cpu_online_mask. Each one is appended to the map.
     */
    for_each_cpu_and(cpu, mask, cpu_online_mask)
        map->cpus[i++] = cpu;

    if (i) {
        /* At least one usable CPU: record how many entries are valid. */
        map->len = i;
    } else {
        /* No requested CPU is online: install no map at all. */
        kfree(map);
        map = NULL;
    }

    mutex_lock(&rps_map_mutex);

    /* Fetch the map currently installed ... */
    old_map = rcu_dereference_protected(queue->rps_map,
                        mutex_is_locked(&rps_map_mutex));
    rcu_assign_pointer(queue->rps_map, map);    /* ... and publish the new one in the queue (this is the actual "store") */

    /* A new map takes one count on rps_needed ... */
    if (map)
        static_key_slow_inc(&rps_needed);
    /* ... and the replaced map gives its count back. */
    if (old_map)
        static_key_slow_dec(&rps_needed);

    mutex_unlock(&rps_map_mutex);

    /* Free the replaced map through RCU. */
    if (old_map)
        kfree_rcu(old_map, rcu);

    free_cpumask_var(mask);
    return len;
}

/*
 * show_rps_dev_flow_table_cnt - print the number of entries of an RX queue's
 * RPS device flow table.
 * @queue: RX queue whose rps_flow_table is inspected
 * @buf:   output buffer
 *
 * store_rps_dev_flow_table_cnt() keeps "entry count - 1" in table->mask, so
 * the count is mask + 1. A queue without a table reports 0.
 *
 * Returns the number of characters written to @buf.
 */
static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
                       char *buf)
{
    struct rps_dev_flow_table *table;
    unsigned long count;

    rcu_read_lock();
    table = rcu_dereference(queue->rps_flow_table);
    /* Widen before adding one so a full 32-bit mask cannot wrap. */
    count = table ? (unsigned long)table->mask + 1 : 0;
    rcu_read_unlock();

    return sprintf(buf, "%lu\n", count);
}

/*
 * rps_dev_flow_table_release - RCU callback that frees an rps_dev_flow_table.
 * @rcu: the rcu member embedded in the table to free
 */
static void rps_dev_flow_table_release(struct rcu_head *rcu)
{
    /* Recover the enclosing table from its rcu member, then vfree() it. */
    vfree(container_of(rcu, struct rps_dev_flow_table, rcu));
}

/*
 * store_rps_dev_flow_table_cnt - resize the RPS device flow table of an RX queue.
 * @queue: RX queue whose rps_flow_table is replaced
 * @buf:   textual entry count
 * @len:   length of @buf
 *
 * The count is rounded up to a power of two. A new table of that size is
 * allocated with every entry's cpu set to RPS_NO_CPU, published in
 * queue->rps_flow_table, and the previous table is freed through call_rcu().
 * A count of 0 removes the table.
 *
 * Returns @len on success, -EPERM without CAP_NET_ADMIN, the kstrtoul()
 * error, -EINVAL when the size would overflow, or -ENOMEM.
 */
static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, const char *buf, size_t len)
{
    unsigned long mask, count;
    struct rps_dev_flow_table *table, *old_table;    /* table being built / table currently installed */
    static DEFINE_SPINLOCK(rps_dev_flow_lock);
    int rc;

    /* Callers without CAP_NET_ADMIN are rejected. */
    if (!capable(CAP_NET_ADMIN))
        return -EPERM;

    /* kstrtoul(string, base, &result): base 0 lets the prefix choose the
     * radix ("0x" hex, leading "0" octal, otherwise decimal).
     */
    rc = kstrtoul(buf, 0, &count);    /* convert a string to an unsigned long */
    /* kstrtoul(): Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error */
    if (rc < 0)
        return rc;

    /* A non-zero count asks for a table. */
    if (count) {
        mask = count - 1;
        /* mask = roundup_pow_of_two(count) - 1;
         * without overflows...
         */
        /* Each pass ORs the mask with itself shifted right by one, which
         * copies set bits into the positions below them. The loop stops once
         * that no longer changes anything, i.e. every bit below the highest
         * set bit is 1 and mask has the form 2^k - 1 (count 5: 100 -> 110 -> 111).
         */
        while ((mask | (mask >> 1)) != mask)
            mask |= (mask >> 1);

        /* On 64 bit arches, must check mask fits in table->mask (u32),
         * and on 32bit arches, must check
         * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
         */
#if BITS_PER_LONG > 32
        /* 64-bit: truncating to u32 must not lose bits, otherwise reject. */
        if (mask > (unsigned long)(u32)mask)
            return -EINVAL;
#else
        /* 32-bit: reject sizes for which RPS_DEV_FLOW_TABLE_SIZE(mask + 1) would overflow. */
        if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
                / sizeof(struct rps_dev_flow)) {
            /* Enforce a limit to prevent overflow */
            return -EINVAL;
        }
#endif
        table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));    /* allocate the new table */
        /* Allocation failed. */
        if (!table)
            return -ENOMEM;

        table->mask = mask;    /* stored as "entry count - 1" */
        /* Mark every entry, flows[0] through flows[mask], as having no CPU. */
        for (count = 0; count <= mask; count++)
            table->flows[count].cpu = RPS_NO_CPU;
    } else {
        /* A count of 0 means: no table. */
        table = NULL;
    }

    spin_lock(&rps_dev_flow_lock);
    /* Fetch the table currently installed ... */
    old_table = rcu_dereference_protected(queue->rps_flow_table,
                          lockdep_is_held(&rps_dev_flow_lock));
    rcu_assign_pointer(queue->rps_flow_table, table);    /* ... and publish the new one in the queue */
    spin_unlock(&rps_dev_flow_lock);

    /* Free the replaced table through an RCU callback. */
    if (old_table)
        call_rcu(&old_table->rcu, rps_dev_flow_table_release);

    return len;
}

/* "rps_cpus" attribute (mode 0644): shown by show_rps_map(), stored by store_rps_map(). */
static struct rx_queue_attribute rps_cpus_attribute __ro_after_init
    = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map);

/* "rps_flow_cnt" attribute (mode 0644): shown by show_rps_dev_flow_table_cnt(),
 * stored by store_rps_dev_flow_table_cnt().
 */
static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init
    = __ATTR(rps_flow_cnt, 0644,
         show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);

...

/* NULL-terminated list of the default attributes of an RX queue.
 * With CONFIG_RPS set it holds the rps_cpus and rps_flow_cnt attributes;
 * without it, only the terminating NULL remains.
 */
static struct attribute *rx_queue_default_attrs[] __ro_after_init = {
#ifdef CONFIG_RPS
    &rps_cpus_attribute.attr,
    &rps_dev_flow_table_cnt_attribute.attr,
#endif
    NULL
};

...

/*
 * rx_queue_release - release the resources attached to an RX queue's kobject.
 * @kobj: kobject embedded in the netdev_rx_queue being released
 *
 * With CONFIG_RPS, detaches and frees (through RCU) the queue's rps_map and
 * rps_flow_table. The kobject itself is only zeroed, and the reference on the
 * queue's device is dropped.
 */
static void rx_queue_release(struct kobject *kobj)
{
    struct netdev_rx_queue *queue = to_rx_queue(kobj);
#ifdef CONFIG_RPS
    struct rps_map *cpu_map;
    struct rps_dev_flow_table *dev_flows;

    /* Detach the RPS map, if any, and free it through RCU. */
    cpu_map = rcu_dereference_protected(queue->rps_map, 1);
    if (cpu_map) {
        RCU_INIT_POINTER(queue->rps_map, NULL);
        kfree_rcu(cpu_map, rcu);
    }

    /* Same for the flow table; it goes through rps_dev_flow_table_release(),
     * which frees it with vfree().
     */
    dev_flows = rcu_dereference_protected(queue->rps_flow_table, 1);
    if (dev_flows) {
        RCU_INIT_POINTER(queue->rps_flow_table, NULL);
        call_rcu(&dev_flows->rcu, rps_dev_flow_table_release);
    }
#endif

    memset(kobj, 0, sizeof(*kobj));    /* wipe the kobject */
    dev_put(queue->dev);    /* release reference to device */
}

...

6. dev.c

...

/*
 * rps_lock - take the lock protecting a softnet_data's input packet queue.
 * @sd: per-CPU softnet_data (incoming packets are placed on per-CPU queues)
 *
 * Does nothing when CONFIG_RPS is not set.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    /* input_pkt_queue is a struct sk_buff_head, which embeds the
     * spinlock_t `lock` taken here.
     */
    spin_lock(&sd->input_pkt_queue.lock);
#endif
}

/*
 * rps_unlock - release the lock taken by rps_lock().
 * @sd: per-CPU softnet_data whose input_pkt_queue lock is released
 *
 * Does nothing when CONFIG_RPS is not set.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

...

/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask selecting the CPU-number bits of a table entry; set in rps_sock_flow_sysctl(). */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static keys counting the installed RPS maps / socket flow tables. */
struct static_key rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);

/*
 * set_rps_cpu - point a device flow entry at a new CPU.
 * @dev:      receiving device
 * @skb:      packet that triggered the switch
 * @rflow:    flow entry currently used for this packet
 * @next_cpu: CPU the flow should move to
 *
 * With CONFIG_RFS_ACCEL the driver may also be asked, through
 * ndo_rx_flow_steer(), to steer the flow to the RX queue serving @next_cpu;
 * in that case the entry of that queue's flow table is used from then on.
 *
 * Returns the flow entry that now describes the flow.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu)
{
    /* nr_cpu_ids is the upper bound of usable CPU ids, so this tests
     * whether next_cpu names a real CPU.
     */
    if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
        struct netdev_rx_queue *rxqueue;
        struct rps_dev_flow_table *flow_table;
        struct rps_dev_flow *old_rflow;
        u32 flow_id;
        u16 rxq_index;
        int rc;

        /* Should we steer this flow to a different hardware queue? */
        if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
            !(dev->features & NETIF_F_NTUPLE))
            goto out;
        /* RX queue associated with next_cpu; nothing to do if the packet already arrived on it. */
        rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
        if (rxq_index == skb_get_rx_queue(skb))
            goto out;

        rxqueue = dev->_rx + rxq_index;
        flow_table = rcu_dereference(rxqueue->rps_flow_table);
        if (!flow_table)
            goto out;
        flow_id = skb_get_hash(skb) & flow_table->mask;
        /* Ask the driver to steer the flow; a negative result means it did not. */
        rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
                            rxq_index, flow_id);
        if (rc < 0)
            goto out;
        /* Continue with the entry of the target queue's table and remember
         * the filter id returned by the driver; if the old entry carried the
         * same filter id, invalidate it there.
         */
        old_rflow = rflow;
        rflow = &flow_table->flows[flow_id];
        rflow->filter = rc;
        if (old_rflow->filter == rflow->filter)
            old_rflow->filter = RPS_NO_FILTER;
    out:
#endif
        /* Snapshot the current input_queue_head of next_cpu into last_qtail.
         * get_rps_cpu() and rps_may_expire_flow() later compare a CPU's
         * input_queue_head against this value.
         */
        rflow->last_qtail = per_cpu(softnet_data, next_cpu).input_queue_head;
    }

    /* Record next_cpu as the flow's CPU (done even when next_cpu >= nr_cpu_ids). */
    rflow->cpu = next_cpu;
    return rflow;
}

/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * @dev:    receiving device
 * @skb:    received packet
 * @rflowp: set to the chosen flow entry when the RFS path picks the CPU
 *
 * Returns the target CPU, or -1 when no CPU is selected.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
               struct rps_dev_flow **rflowp)
{
    const struct rps_sock_flow_table *sock_flow_table;    /* same type as the global table */
    struct netdev_rx_queue *rxqueue = dev->_rx;    /* starts at the device's first RX queue */
    struct rps_dev_flow_table *flow_table;
    struct rps_map *map;
    int cpu = -1;
    u32 tcpu;
    u32 hash;

    /* skb_rx_queue_recorded(): whether the skb carries a queue mapping.
     * Per the original notes it returns skb->queue_mapping != 0.
     */
    if (skb_rx_queue_recorded(skb)) {
        /* skb_get_rx_queue(): per the original notes it returns
         * skb->queue_mapping - 1, i.e. the RX queue index.
         */
        u16 index = skb_get_rx_queue(skb);

        /* real_num_rx_queues: Number of RX queues currently active in device.
         * An index at or beyond it is invalid: warn once and select no CPU.
         */
        if (unlikely(index >= dev->real_num_rx_queues)) {
            WARN_ONCE(dev->real_num_rx_queues > 1,
                  "%s received packet on queue %u, but number "
                  "of RX queues is %u\n",
                  dev->name, index, dev->real_num_rx_queues);
            goto done;
        }
        rxqueue += index;
    }

    /* Avoid computing hash if RFS/RPS is not active for this rxqueue */

    flow_table = rcu_dereference(rxqueue->rps_flow_table);    /* RPS flow table of this RX queue */
    map = rcu_dereference(rxqueue->rps_map);    /* RPS map of this RX queue */

    /* Neither configured: RPS/RFS is simply off for this queue, not an error. */
    if (!flow_table && !map)
        goto done;

    skb_reset_network_header(skb);    /* per the original notes: skb->network_header = skb->data - skb->head */

    /* Returns the packet hash stored in the skb; per the original notes the
     * flow hash is computed and stored first when the skb has neither
     * l4_hash nor sw_hash set.
     */
    hash = skb_get_hash(skb);

    /* Without a hash there is nothing to steer by. */
    if (!hash)
        goto done;

    sock_flow_table = rcu_dereference(rps_sock_flow_table);    /* global socket flow table */
    /* RFS path: needs both the per-queue flow table and the global table. */
    if (flow_table && sock_flow_table) {
        struct rps_dev_flow *rflow;
        u32 next_cpu;
        u32 ident;

        /* First check into global flow table if there is a match */
        ident = sock_flow_table->ents[hash & sock_flow_table->mask];
        /* The hash bits stored in the entry differ from this packet's hash,
         * so the entry does not describe this flow.
         */
        if ((ident ^ hash) & ~rps_cpu_mask)
            goto try_rps;    /* fall back from RFS to plain RPS */
        next_cpu = ident & rps_cpu_mask;    /* the low bits of the entry are the desired CPU */

        /* OK, now we know there is a match,
         * we can look at the local (per receive queue) flow table
         */
        rflow = &flow_table->flows[hash & flow_table->mask];
        tcpu = rflow->cpu;    /* CPU currently recorded for this flow in the rx-queue table */

        /*
         * If the desired CPU (where last recvmsg was done) is
         * different from current CPU (one in the rx-queue flow
         * table entry), switch if one of the following holds:
         *   - Current CPU is unset (>= nr_cpu_ids).
         *   - Current CPU is offline.
         *   - The current CPU's queue tail has advanced beyond the
         *     last packet that was enqueued using this table entry.
         *     This guarantees that all previous packets for the flow
         *     have been dequeued, thus preserving in order delivery.
         *
         * The third condition is the signed difference
         * input_queue_head(tcpu) - rflow->last_qtail being >= 0.
         */
        if (unlikely(tcpu != next_cpu) &&
            (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
             ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
              rflow->last_qtail)) >= 0)) {
            tcpu = next_cpu;
            rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
        }

        /* Use the CPU only if it is a valid id and online. */
        if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
            /* Hand the flow entry back to the caller and select the CPU. */
            *rflowp = rflow;
            cpu = tcpu;
            goto done;
        }
    }

try_rps:

    /* Plain RPS: scale the hash onto the map and use that CPU if it is online. */
    if (map) {
        tcpu = map->cpus[reciprocal_scale(hash, map->len)];
        if (cpu_online(tcpu)) {
            cpu = tcpu;
            goto done;
        }
    }

done:
    return cpu;
}

#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id)
{
    struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
    struct rps_dev_flow_table *table;
    struct rps_dev_flow *flow;
    unsigned int flow_cpu;
    bool expire = true;    /* expire unless every check below says otherwise */

    rcu_read_lock();
    table = rcu_dereference(rxqueue->rps_flow_table);

    /* No table, or a flow_id outside it (valid ids are 0..mask): may expire. */
    if (!table || flow_id > table->mask)
        goto unlock;

    /* flow_id is the index of the entry inside the table. */
    flow = &table->flows[flow_id];
    flow_cpu = READ_ONCE(flow->cpu);

    /* The entry must still belong to this filter and name a valid CPU;
     * the CPU check also guards the per_cpu() access below.
     */
    if (flow->filter != filter_id || flow_cpu >= nr_cpu_ids)
        goto unlock;

    /* Keep the filter while that CPU's input_queue_head has advanced by less
     * than 10 * mask since the flow's last recorded enqueue position.
     */
    if ((int)(per_cpu(softnet_data, flow_cpu).input_queue_head -
          flow->last_qtail) <
        (int)(10 * table->mask))
        expire = false;

unlock:
    rcu_read_unlock();
    return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */

/* Called from hardirq (IPI) context
 *
 * @data is the softnet_data whose backlog NAPI gets scheduled; its
 * received_rps counter is then incremented.
 */
static void rps_trigger_softirq(void *data)
{
    struct softnet_data *target = data;

    ____napi_schedule(target, &target->backlog);
    target->received_rps++;
}

#endif /* CONFIG_RPS */

...

/*
 * Check if this softnet_data structure is another cpu one
 * If yes, queue it to our IPI list and return 1
 * If no, return 0
 *
 * In other words: @sd is compared against the softnet_data of the CPU
 * running this code. Only when they differ is @sd linked into the local
 * CPU's rps_ipi_list. Without CONFIG_RPS the function always returns 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

    // sd is not the softnet_data of the CPU executing this code
    if (sd != mysd) {
        // Push sd onto the head of the local CPU's singly linked IPI list
        sd->rps_ipi_next = mysd->rps_ipi_list;
        mysd->rps_ipi_list = sd;

        // Raises NET_RX_SOFTIRQ; this call does not disable IRQs itself.
        // NOTE(review): the _irqoff suffix presumably means the caller
        // already runs with IRQs off - confirm against the call site.
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
    }
#endif /* CONFIG_RPS */
    return 0;
}

...

/*
 * netif_rx_internal - queue a received skb onto a per-CPU backlog.
 *
 * Order of operations: timestamp check, tracepoint, optional generic XDP
 * pass, then enqueue_to_backlog() either on the CPU selected by
 * get_rps_cpu() (when rps_needed is enabled) or on the current CPU.
 *
 * Returns NET_RX_SUCCESS when generic XDP yields anything other than
 * XDP_PASS, otherwise the return value of enqueue_to_backlog().
 */
static int netif_rx_internal(struct sk_buff *skb)
{
    int ret;

    net_timestamp_check(netdev_tstamp_prequeue, skb);

    trace_netif_rx(skb);

    if (static_branch_unlikely(&generic_xdp_needed_key)) {
        /* NOTE(review): shadows the outer ret; only used inside this block. */
        int ret;

        preempt_disable();
        rcu_read_lock();
        ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
        rcu_read_unlock();
        preempt_enable();

        /* Consider XDP consuming the packet a success from
         * the netdev point of view we do not want to count
         * this as an error.
         */
        if (ret != XDP_PASS)
            return NET_RX_SUCCESS;
    }

#ifdef CONFIG_RPS
    if (static_key_false(&rps_needed)) {
        /* voidflow is a stack placeholder so that rflow->last_qtail is
         * always writable; get_rps_cpu() receives &rflow and presumably
         * repoints it at a real flow entry when one is found.
         */
        struct rps_dev_flow voidflow, *rflow = &voidflow;
        int cpu;

        preempt_disable();
        rcu_read_lock();

        /* A negative result means no steering target: use this CPU. */
        cpu = get_rps_cpu(skb->dev, skb, &rflow);
        if (cpu < 0)
            cpu = smp_processor_id();

        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

        rcu_read_unlock();
        preempt_enable();
    } else
#endif
    {
        /* Non-RPS path: enqueue on the current CPU; qtail is discarded. */
        unsigned int qtail;

        ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
        put_cpu();
    }
    return ret;
}

...

/*
 * netif_receive_skb_internal - receive path with optional RPS steering.
 *
 * Returns NET_RX_SUCCESS early when skb_defer_rx_timestamp() takes the
 * skb, NET_RX_DROP when generic XDP yields anything other than XDP_PASS,
 * the result of enqueue_to_backlog() when get_rps_cpu() picks a CPU, and
 * otherwise the result of __netif_receive_skb() run on the current CPU.
 */
static int netif_receive_skb_internal(struct sk_buff *skb)
{
    int ret;

    net_timestamp_check(netdev_tstamp_prequeue, skb);

    if (skb_defer_rx_timestamp(skb))
        return NET_RX_SUCCESS;

    if (static_branch_unlikely(&generic_xdp_needed_key)) {
        /* NOTE(review): shadows the outer ret; only used inside this block. */
        int ret;

        preempt_disable();
        rcu_read_lock();
        ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
        rcu_read_unlock();
        preempt_enable();

        /* Unlike netif_rx_internal(), a non-PASS verdict is reported as a drop. */
        if (ret != XDP_PASS)
            return NET_RX_DROP;
    }

    rcu_read_lock();
#ifdef CONFIG_RPS
    if (static_key_false(&rps_needed)) {
        /* voidflow is a stack placeholder so rflow->last_qtail is always writable. */
        struct rps_dev_flow voidflow, *rflow = &voidflow;
        int cpu = get_rps_cpu(skb->dev, skb, &rflow);

        /* A steering target was found: hand the skb to that CPU's backlog. */
        if (cpu >= 0) {
            ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
            rcu_read_unlock();
            return ret;
        }
    }
#endif
    /* No steering (or RPS disabled): process the skb right here. */
    ret = __netif_receive_skb(skb);
    rcu_read_unlock();
    return ret;
}

...

/*
 * Walk the list linked through rps_ipi_next, starting at @remsd, and call
 * smp_call_function_single_async() for every entry whose CPU is online.
 * Without CONFIG_RPS the function body is empty.
 */
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
    struct softnet_data *following;

    for (; remsd; remsd = following) {
        /* Fetch the link up front, before the entry's csd is submitted. */
        following = remsd->rps_ipi_next;

        if (!cpu_online(remsd->cpu))
            continue;
        smp_call_function_single_async(remsd->cpu, &remsd->csd);
    }
#endif
}

/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * Every path through this function calls local_irq_enable() exactly once.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    struct softnet_data *remsd = sd->rps_ipi_list;    // head of the pending IPI list

    // The list is non-empty: detach it, re-enable IRQs, then send the IPIs
    if (remsd) {
        sd->rps_ipi_list = NULL;    // detach the whole list from sd

        local_irq_enable();    // re-enable local IRQs before sending

        /* Send pending IPI's to kick RPS processing on remote cpus. */
        net_rps_send_ipi(remsd);
    } else
#endif
        local_irq_enable();    // nothing pending (or CONFIG_RPS off): only re-enable local IRQs
}

/*
 * Report whether @sd has at least one entry queued on its rps_ipi_list.
 * Always false when CONFIG_RPS is not set.
 */
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
    if (sd->rps_ipi_list)
        return true;
#endif
    return false;
}

...

/*
 * dev_cpu_dead - take over the networking state of an offlined CPU.
 * @oldcpu: the CPU whose per-CPU softnet_data is being drained
 *
 * Registered by net_dev_init() for the CPUHP_NET_DEV_DEAD hotplug state.
 * Moves the completion queue, output queue and NAPI poll list of @oldcpu
 * onto the CPU running this callback, sends any IPIs still pending on
 * @oldcpu's list, and re-injects its queued packets via netif_rx_ni().
 *
 * Always returns 0.
 */
static int dev_cpu_dead(unsigned int oldcpu)
{
    struct sk_buff **list_skb;
    struct sk_buff *skb;
    unsigned int cpu;
    struct softnet_data *sd, *oldsd, *remsd = NULL;

    /* The queue splicing below runs with local IRQs disabled. */
    local_irq_disable();
    cpu = smp_processor_id();
    sd = &per_cpu(softnet_data, cpu);
    oldsd = &per_cpu(softnet_data, oldcpu);

    /* Find end of our completion_queue. */
    list_skb = &sd->completion_queue;
    while (*list_skb)
        list_skb = &(*list_skb)->next;
    /* Append completion queue from offline CPU. */
    *list_skb = oldsd->completion_queue;
    oldsd->completion_queue = NULL;

    /* Append output queue from offline CPU. */
    if (oldsd->output_queue) {
        *sd->output_queue_tailp = oldsd->output_queue;
        sd->output_queue_tailp = oldsd->output_queue_tailp;
        /* Reset the old queue to the empty state (tail points at head). */
        oldsd->output_queue = NULL;
        oldsd->output_queue_tailp = &oldsd->output_queue;
    }
    /* Append NAPI poll list from offline CPU, with one exception :
     * process_backlog() must be called by cpu owning percpu backlog.
     * We properly handle process_queue & input_pkt_queue later.
     */
    while (!list_empty(&oldsd->poll_list)) {
        struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
                                struct napi_struct,
                                poll_list);

        list_del_init(&napi->poll_list);
        /* The backlog NAPI is not migrated; its state is just cleared. */
        if (napi->poll == process_backlog)
            napi->state = 0;
        else
            ____napi_schedule(sd, napi);
    }

    raise_softirq_irqoff(NET_TX_SOFTIRQ);
    local_irq_enable();

#ifdef CONFIG_RPS
    /* Detach the offline CPU's pending IPI list. */
    remsd = oldsd->rps_ipi_list;
    oldsd->rps_ipi_list = NULL;
#endif
    /* send out pending IPI's on offline CPU */
    net_rps_send_ipi(remsd);

    /* Process offline CPU's input_pkt_queue */
    /* process_queue is drained first, then input_pkt_queue; every packet
     * is re-submitted through netif_rx_ni() and counted on oldsd's head.
     */
    while ((skb = __skb_dequeue(&oldsd->process_queue))) {
        netif_rx_ni(skb);
        input_queue_head_incr(oldsd);
    }
    while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
        netif_rx_ni(skb);
        input_queue_head_incr(oldsd);
    }

    return 0;
}

...

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 *       Initialises the protocol-type lists, the per-CPU softnet_data
 *       (queues, poll list, backlog NAPI and, with CONFIG_RPS, the IPI
 *       call descriptor), registers the pernet operations, opens the
 *       NET_TX/NET_RX softirqs and installs dev_cpu_dead() as the
 *       CPU-hotplug teardown callback.
 *
 *       Returns 0 on success, -ENOMEM if any step before the hotplug
 *       registration fails.
 */
static int __init net_dev_init(void)
{
    /* rc stays -ENOMEM until every step before "out" has succeeded. */
    int i, rc = -ENOMEM;

    BUG_ON(!dev_boot_phase);

    if (dev_proc_init())
        goto out;

    if (netdev_kobject_init())
        goto out;

    INIT_LIST_HEAD(&ptype_all);
    for (i = 0; i < PTYPE_HASH_SIZE; i++)
        INIT_LIST_HEAD(&ptype_base[i]);

    INIT_LIST_HEAD(&offload_base);

    if (register_pernet_subsys(&netdev_net_ops))
        goto out;

    /*
     *    Initialise the packet receive queues.
     */

    for_each_possible_cpu(i) {
        struct work_struct *flush = per_cpu_ptr(&flush_works, i);
        struct softnet_data *sd = &per_cpu(softnet_data, i);

        INIT_WORK(flush, flush_backlog);

        skb_queue_head_init(&sd->input_pkt_queue);
        skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
        skb_queue_head_init(&sd->xfrm_backlog);
#endif
        INIT_LIST_HEAD(&sd->poll_list);
        /* Empty output queue: the tail pointer refers to the head itself. */
        sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
        /* Call descriptor used by net_rps_send_ipi(): runs
         * rps_trigger_softirq() with this softnet_data as argument.
         */
        sd->csd.func = rps_trigger_softirq;
        sd->csd.info = sd;
        sd->cpu = i;
#endif

        /* The per-CPU backlog is a NAPI instance polled by process_backlog(). */
        sd->backlog.poll = process_backlog;
        sd->backlog.weight = weight_p;
    }

    dev_boot_phase = 0;

    /* The loopback device is special if any other network devices
     * is present in a network namespace the loopback device must
     * be present. Since we now dynamically allocate and free the
     * loopback device ensure this invariant is maintained by
     * keeping the loopback device as the first device on the
     * list of network devices.  Ensuring the loopback devices
     * is the first device that appears and the last network device
     * that disappears.
     */
    if (register_pernet_device(&loopback_net_ops))
        goto out;

    if (register_pernet_device(&default_device_ops))
        goto out;

    open_softirq(NET_TX_SOFTIRQ, net_tx_action);
    open_softirq(NET_RX_SOFTIRQ, net_rx_action);

    /* A failure here only triggers a warning; rc is reset to 0 afterwards. */
    rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
                       NULL, dev_cpu_dead);
    WARN_ON(rc < 0);
    rc = 0;
out:
    return rc;
}

subsys_initcall(net_dev_init);

7. af_inet.c

...

/*
 * inet_sendmsg - sendmsg entry point for inet sockets.
 *
 * Records the socket's flow through sock_rps_record_flow() (see sock.h),
 * auto-binds the socket when it has no local port yet (inet_num == 0)
 * and the protocol has not set no_autobind, then hands the message to
 * the protocol's own sendmsg handler.
 *
 * Notes on struct msghdr carried over from the original annotation; the
 * structure is not shown here, so verify against its definition:
 *   msg_name    - pointer to the socket address structure
 *   msg_iter    - the data (presumably the struct iov_iter member)
 *   msg_control - ancillary data
 *   msg_flags   - flags on the received message
 *   msg_iocb    - pointer to the I/O control block for async requests
 *
 * Returns -EAGAIN when inet_autobind() reports failure (non-zero),
 * otherwise whatever the protocol's sendmsg returns.
 */
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
    struct sock *sk = sock->sk;
    bool needs_bind;

    sock_rps_record_flow(sk);

    /* We may need to bind the socket. inet_autobind() is attempted only
     * when there is no local port and the protocol allows autobind.
     */
    needs_bind = !inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind;
    if (needs_bind && inet_autobind(sk))
        return -EAGAIN;

    return sk->sk_prot->sendmsg(sk, msg, size);
}
EXPORT_SYMBOL(inet_sendmsg);

...

/*
 * inet_recvmsg - recvmsg entry point for inet sockets.
 *
 * Records the socket's flow through sock_rps_record_flow() unless
 * MSG_ERRQUEUE is set, then calls the protocol's recvmsg handler with
 * MSG_DONTWAIT split out of @flags. On success (result >= 0) the address
 * length reported by the protocol is stored in msg->msg_namelen.
 *
 * Returns the protocol handler's result unchanged.
 */
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
         int flags)
{
    struct sock *sk = sock->sk;
    int namelen = 0;
    int rc;

    if (likely(!(flags & MSG_ERRQUEUE)))
        sock_rps_record_flow(sk);

    rc = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
                  flags & ~MSG_DONTWAIT, &namelen);
    if (rc < 0)
        return rc;

    msg->msg_namelen = namelen;
    return rc;
}
EXPORT_SYMBOL(inet_recvmsg);

...

리눅스 네트워킹 스택에서의 확장성

출처

https://wariua.cafe24.com/wiki/Documentation/networking/scaling.txt

소개

이 문서는 다중 프로세서 시스템에서 병렬성을 높이고 성능을 향상시키기 위한 리눅스 네트워킹 스택에서의 보완적 기법들을 기술합니다.

다음 기술들을 설명합니다:

- RSS: Receive Side Scaling
- RPS: Receive Packet Steering
- RFS: Receive Flow Steering
- Accelerated Receive Flow Steering
- XPS: Transmit Packet Steering

RSS: Receive Side Scaling

요즘의 NIC들은 복수의 수신 및 출력 디스크립터 큐를 지원합니다 (다중 큐). 수신 시에 NIC는 각 패킷을 다른 큐로 보내서 CPU들 사이에 처리를 분산시킬 수 있습니다. NIC는 각 패킷에 필터를 적용해서 각 패킷을 소수의 논리적 흐름 중 하나로 할당합니다. 각 흐름에 대한 패킷을 별개의 수신 큐로 밀어 넣고, 그러면 이를 다시 별개의 CPU가 처리할 수 있습니다. 이러한 메커니즘을 일반적으로 "Receive-side Scaling"(RSS)이라고 합니다. RSS 및 여타 확장성 기법들의 목표는 성능을 균등하게 증가시키는 것입니다. 물론 다중 큐 분산은 트래픽 순위 조정에도 쓰일 수 있지만 이 기법들이 주로 관심을 두는 부분은 아닙니다.

RSS에서 쓰는 필터는 일반적으로 네트워크 및/또는 전송 계층 헤더에 대한 해시 함수입니다. 예를 들면 패킷의 IP 주소 및 TCP 포트에 대한 4튜플 해시입니다. 가장 흔한 RSS 하드웨어 구현체에서는 128항목 간접(indirection) 테이블을 사용하는데, 각 항목에 큐 번호가 저장되어 있습니다. 패킷에 대해 계산한 해시(일반적으로 Toeplitz 해시)의 하위 일곱 비트를 마스킹 하고, 그 숫자를 간접 테이블에 대한 키로 사용해서 대응하는 값을 얻어와서 패킷의 수신 큐를 결정합니다.

일부 고급 NIC들에서는 프로그램 가능한 필터에 기반해서 패킷들을 큐로 밀어 넣을 수 있습니다. 예를 들어, 웹 서버로 가는 TCP 포트 80 패킷을 별개의 수신 큐로 돌릴 수 있습니다. 그런 "n튜플" 필터는 ethtool로 설정할 수 있습니다 (--config-ntuple).

RSS 구성

다중 큐 지원 NIC에 대한 드라이버는 보통 구성할 하드웨어 큐의 개수를 지정할 수 있는 커널 모듈 매개변수를 제공합니다. 예를 들어 bnx2x 드라이버에서는 이 매개변수의 이름이 num_queues입니다. 전형적인 RSS 구성에서는 장치가 큐를 충분히 지원한다면 각 CPU마다 하나씩의 수신 큐가 있을 것이고, 그렇지 않다면 적어도 각 메모리 도메인마다 최소 하나씩이 있게 됩니다. 여기서 메모리 도메인이란 특정 메모리 수준(L1, L2, NUMA 노드 등)을 공유하는 CPU들의 집합을 말합니다.

해시 값 마스킹으로 큐를 결정하는 RSS 장치의 간접 테이블은 일반적으로 초기화 때에 드라이버가 프로그램 합니다. 기본 매핑은 테이블 내에서 큐들을 고르게 분산하는 것이되, ethtool 명령을 이용해서 (--show-rxfh-indir 및 --set-rxfh-indir) 런타임에 간접 테이블을 읽거나 변경할 수 있습니다. 간접 테이블 변경을 통해 각 큐에 서로 다른 상대적 가중치를 줄 수도 있습니다.

RSS IRQ 구성

각 수신 큐는 별도의 연관 IRQ를 가지게 됩니다. NIC는 어떤 큐에 새 패킷이 도착했을 때 IRQ를 유발해서 CPU에게 알려줍니다. PCIe 장치의 신호 경로에서는 각 인터럽트의 경로를 특정 CPU로 지정할 수 있는 message signaled interrupts(MSI-X)를 사용합니다. 큐에서 IRQ로의 활성 매핑은 /proc/interrupts에서 알 수 있습니다. 기본적으로 한 IRQ는 어떤 CPU에서도 처리될 수 있습니다. 패킷 처리의 무시할 수 없는 부분이 수신 인터럽트 처리에서 이뤄지기 때문에 수신 인터럽트를 CPU들 사이에서 나누는 것이 유리합니다. 각 인터럽트의 IRQ 친연성(affinity)을 수동으로 조정하는 건 Documentation/IRQ-affinity.txt를 참고하세요. 어떤 시스템에는 irqbalance가 돌고 있을 텐데, 이 데몬은 IRQ 할당을 동적으로 최적화 하며, 그래서 수동 설정을 바꿔버릴 수 있습니다.

제안하는 구성

지연 시간이 염려되거나 수신 인터럽트 처리가 병목이 될 때 RSS를 활성화 해야 합니다. 부하를 CPU들 사이에서 분산하면 큐 길이가 줄어듭니다. 저지연(low latency) 네트워킹을 위한 최적의 설정은 시스템에 있는 CPU 수만큼 많은 (그게 안 되면 NIC 최대치로) 큐를 할당하는 것입니다. 가장 효율적인 고속 처리 구성은 CPU 포화로 수신 큐가 넘치지 않는 한에서 가장 적은 수신 큐를 만드는 것일 겁니다. 왜냐면 인터럽트 합병(coalescing)이 활성화 된 기본 동작 방식에서는 큐가 많아질수록 인터럽트의 총 개수가 (즉 할 일이) 늘어나기 때문입니다.

mpstat 유틸리티를 이용하면 CPU별 부하를 관찰할 수 있습니다. 단, 하이퍼스레딩(HT)이 되는 프로세서에서는 각 하이퍼스레드가 별개의 CPU로 표현되는 것에 유의해야 합니다. 초기 테스트들에서 HT는 인터럽트 처리에 있어서는 어떤 이득도 보이지 않았습니다. 따라서 큐의 개수를 시스템의 CPU 코어 개수로 제한하면 됩니다.

RPS: Receive Packet Steering

Receive Packet Steering(RPS)은 논리적으로 보면 RSS의 소프트웨어 구현입니다. 소프트웨어에 위치하기 때문에 필연적으로 데이터 경로에서 더 늦게 불리게 됩니다. RSS에서 큐를 선택하고 그래서 하드웨어 인터럽트 핸들러를 실행할 CPU를 선택하는 반면, RPS에서는 인터럽트 핸들러 상위의 프로토콜 처리를 수행할 CPU를 선택합니다. 원하는 CPU의 백로그에 패킷을 위치시키고 그 CPU를 깨워서 처리를 하게 만듭니다. RPS는 RSS와 비교할 때 몇 가지 장점이 있습니다. 1) 어떤 NIC와도 함께 사용할 수 있으며, 2) 새로운 프로토콜을 해시 할 수 있는 소프트웨어 필터를 쉽게 추가할 수 있으며, 3) 하드웨어 장치 인터럽트 발생을 높이지 않습니다 (다만 프로세서간 인터럽트(IPI)를 유발하기는 합니다).

RPS는 수신 인터럽트 핸들러의 바톰 하프에서, 좀 더 정확하게는 드라이버가 netif_rx()나 netif_receive_skb()로 패킷을 네트워크 스택에 보낸 때에 호출됩니다. 그러면 get_rps_cpu()를 호출해서 패킷을 처리할 큐를 선택합니다.

RPS에서 대상 CPU를 결정하는 첫 번째 단계는 패킷의 주소나 포트로 (프로토콜에 따라 2튜플 또는 4튜플 해시) 흐름 해시를 계산하는 것입니다. 이는 패킷의 연관 흐름 내내 일관적인 해시가 되어 줍니다. 해시를 하드웨어에서 제공할 수도 있는데, 그렇지 않으면 스택에서 계산하게 됩니다. 제공 가능한 하드웨어는 패킷의 수신 디스크립터 안에 해시를 넘겨줄 수 있습니다. 일반적으로는 RSS에서 쓰는 해시와 (예로, Toeplitz 해시) 같을 겁니다. 해시는 skb->rx_hash에 저장되며 패킷 흐름의 해시로서 스택 내 어디에서든 사용할 수 있습니다.

각 수신 하드웨어 큐에는 RPS에서 패킷을 큐에 넣어서 처리를 담당하게 할 수 있는 연관 CPU들의 목록이 있습니다. 수신된 각 패킷마다 흐름 해시를 목록 크기로 모듈러 연산 해서 목록에 대한 인덱스를 계산합니다. 인덱스가 가리키는 CPU가 패킷 처리를 담당할 대상이 되며, 패킷은 그 CPU의 백로그 큐 끝에 큐잉 됩니다. 그리고 바톰 하프 루틴 막바지에 백로그 큐에 패킷이 들어간 모든 CPU에게 IPI를 보냅니다. IPI는 원격 CPU에서 백로그 처리를 일으키고, 큐잉 된 패킷들이 네트워킹 스택으로 올라가서 처리됩니다.

RPS 구성

RPS를 쓰려면 커널이 CONFIG_RPS kconfig 심볼과 함께 컴파일 되어야 합니다 (SMP에서는 기본적으로 켜져 있음). 함께 컴파일이 되어 있어도 명시적으로 구성을 해주기 전까지는 RPS가 비활성화 되어 있습니다. sysfs 파일 항목을 이용해서 각 수신 큐에 대해 RPS가 트래픽을 전달해줄 수 있는 CPU 목록을 설정할 수 있습니다.

/sys/class/net/<dev>/queues/rx-<n>/rps_cpus

이 파일은 CPU의 비트맵을 구현하고 있습니다. 값이 0일 때(기본값)는 RPS가 비활성화 되고, 그 경우 패킷은 인터럽트를 받은 CPU에서 처리됩니다. Documentation/IRQ-affinity.txt에서 비트맵에 CPU를 할당하는 방법을 설명하고 있습니다.

제안하는 구성

단일 큐 장치의 경우 전형적인 RPS 구성은 인터럽트를 받은 CPU와 같은 메모리 도메인에 있는 CPU들로 rps_cpus를 설정하는 게 될 겁니다. NUMA 지역성이 문제가 되지 않는다면 시스템의 모든 CPU가 될 수도 있습니다. 인터럽트 발생 빈도가 높다면 인터럽트를 받은 CPU는 제외하는 게 현명할 수도 있습니다. 이미 많은 작업을 수행하고 있으니까요.

다중 큐 시스템의 경우 하드웨어 수신 큐가 각각의 CPU로 사상되도록 RSS를 구성한다면 이는 아마 불필요한 중복일 겁니다. 하드웨어 큐의 수가 CPU보다 적은 경우라면 RPS가 유익할 수 있습니다. 각 큐에 대한 rps_cpus를 그 큐에 대한 인터럽트를 받는 CPU와 같은 메모리 도메인을 공유하는 CPU들로 해주면 됩니다.

RPS 흐름 제한

RPS는 순서 바뀜을 유발하지 않으면서 커널 수신 처리를 여러 CPU로 확장해 줍니다. 동일한 흐름의 모든 패킷을 같은 CPU로 보내기 위해 감수해야 하는 문제는 흐름들마다 패킷 속도가 다른 경우의 CPU 부하 불균형입니다. 극단적인 경우, 한 흐름이 트래픽 상당 부분을 차지할 수 있습니다. 그런데 동시 연결이 많은 일반적인 서버 작업 부하에서는 특히, 그런 식의 동작은 잘못된 설정이나 출발 주소를 변조한 서비스 거부 공격 같은 문제를 드러내는 것입니다.

선택적인 RPS 기능인 흐름 제한(Flow Limit)은 작은 흐름의 패킷들에 앞서 큰 흐름들의 패킷들을 폐기해서 CPU 경쟁이 있을 때 작은 흐름에 우선 순위를 줍니다. 이 동작은 RPS 내지 RFS의 대상 CPU가 포화 상태에 가까울 때에만 동작합니다. CPU의 입력 패킷 큐 길이가 최대 큐 길이(sysctl net.core.netdev_max_backlog로 설정)의 절반을 넘으면 커널에서는 최근 256개 패킷에 대해 흐름별 패킷 개수를 세기 시작합니다. 새 패킷이 도착했을 때 어떤 흐름의 패킷이 지정된 비율(기본값은 절반)을 초과하면 새 패킷이 폐기됩니다. 다른 흐름의 패킷들은 여전히 입력 패킷 큐가 netdev_max_backlog에 도달했을 때만 폐기됩니다. 입력 패킷 큐 길이가 그 임계치 이하일 때에는 어떤 패킷도 폐기되지 않으므로 흐름 제한은 연결을 전면적으로 끊어버리지 않습니다. 큰 흐름들이라도 연결을 유지합니다.

인터페이스

흐름 제한은 기본적으로 컴파일에 포함되지만 (CONFIG_NET_FLOW_LIMIT) 기본적으로 켜지지는 않습니다. (락 및 캐시 경쟁을 피하기 위해) CPU별로 독립적으로 구현돼 있으며 sysctl net.core.flow_limit_cpu_bitmap의 해당 비트를 설정하여 CPU별로 켜거나 끕니다. procfs에서 쓸 때의 인터페이스는 rps_cpus와 같은 (앞의 설명 참고) CPU 비트맵입니다.

/proc/sys/net/core/flow_limit_cpu_bitmap

흐름별 속도는 각 패킷을 해시 테이블 버킷으로 해싱 한 다음 버킷별 카운터를 올리는 방식으로 계산합니다. 해시 함수는 RPS에서 CPU를 선택하는 것과 동일한 함수이지만 버킷 개수가 CPU 개수보다 훨씬 많으므로 흐름 제한은 더 적은 양성 오탐으로 큰 흐름들을 세밀하게 식별할 수 있습니다. 기본 테이블의 버킷은 4096개입니다. 이 값은 다음 sysctl로 변경할 수 있습니다.

net.core.flow_limit_table_len

이 값은 새 테이블을 할당할 때에만 참조합니다. 값을 변경해도 사용 중인 테이블을 바꾸지는 않습니다.

제안하는 구성

흐름 제한은 동시 연결이 많으면서 한 연결이 CPU 50%를 차지하는 게 문제 상황인 시스템에서 유용합니다. 그런 환경에서는 (/proc/irq/N/smp_affinity 설정에 따라) 네트워크 수신 인터럽트를 처리하는 모든 CPU에서 기능을 활성화 하면 됩니다.

이 기능은 입력 패킷 큐 길이가 흐름 제한 임계치(50%)를 초과하는 것과 최근 흐름 내역 길이(256)에 의존합니다. 실험에서는 net.core.netdev_max_backlog를 1000이나 10000으로 설정했을 때 잘 동작했습니다.

RFS: Receive Flow Steering

RPS는 해시만을 기반으로 패킷을 제어하기 때문에 일반적으로 훌륭한 부하 분산을 해주기는 하지만 응용 지역성을 고려하지는 않습니다. 이는 Receive Flow Steering(RFS)으로 이룰 수 있습니다. RFS의 목표는 패킷의 커널 처리를 패킷을 소비하는 응용 스레드가 돌고 있는 CPU에서 이뤄지도록 해서 데이터 캐시 히트율을 높이는 것입니다. RFS는 다른 CPU의 백로그로 패킷을 집어넣고 그 CPU를 깨우는 데 RPS에서와 같은 메커니즘을 사용합니다.

RFS에서는 패킷이 해시 값에 의해 바로 전달되지 않고, 해시를 흐름 검색 테이블에서의 인덱스로 사용합니다. 그 테이블은 흐름을 그 흐름이 처리되고 있는 CPU로 매핑 합니다. 테이블의 인덱스를 계산하는 데에는 흐름 해시(앞의 RPS 섹션 참고)가 쓰입니다. 각 항목에 기록된 CPU는 그 흐름을 최근에 처리한 CPU입니다. 그리고 항목에 유효한 CPU가 담겨있지 않으면 그 항목으로 사상된 패킷은 단순하게 RPS를 이용해 방향이 정해집니다. 여러 테이블 항목들이 같은 CPU를 가리킬 수 있습니다. 실제로, 흐름이 많고 CPU가 적은 경우 하나의 응용 스레드가 서로 다른 많은 흐름 해시들을 처리할 가능성이 아주 높습니다.

rps_sock_flow_table은 흐름별로 원하는 CPU, 즉 현재 사용자 공간에서 그 흐름을 처리하고 있는 CPU를 담고 있는 전역 흐름 테이블입니다. 각 테이블 값은 CPU 인덱스이며, recvmsg 및 sendmsg (더 구체적으로는 inet_recvmsg(), inet_sendmsg(), inet_sendpage(), tcp_splice_read()) 호출 과정에서 갱신됩니다.

스케줄러가 스레드를 새로운 CPU로 옮기는데 이전 CPU에 미처리 수신 패킷이 있을 때 패킷들이 순서가 바뀌어 도착할 수도 있습니다. 이를 피하기 위해 RFS에서는 각 흐름에 대한 미처리 패킷들을 추적하기 위한 두 번째 흐름 테이블을 사용합니다. rps_dev_flow_table은 각 장치의 각 하드웨어 수신 큐별로 있는 테이블입니다. 각 테이블 값에서 CPU 인덱스와 카운터를 저장합니다. CPU 인덱스는 이 흐름의 패킷들을 이후 커널 처리를 위해 큐잉 하는 현재 CPU를 나타냅니다. 이상적으로는 커널 및 사용자 공간 처리가 같은 CPU에서 일어나며 따라서 두 테이블의 CPU 인덱스가 동일합니다. 하지만 최근에 스케줄러가 사용자 공간 스레드를 이전했는데 여전히 이전 CPU에서의 커널 처리를 위해 큐잉 해둔 패킷들이 커널에 있으면 그렇게 되지 않을 가능성이 높습니다.

rps_dev_flow_table 값의 카운터는 이 흐름 내의 패킷이 마지막으로 큐잉 되었을 때 현재 CPU의 백로그 길이를 기록합니다. 각 백로그 큐에는 head 카운터가 있어서 큐에서 꺼낼 때 이를 증가시킵니다. 그리고 tail 카운터는 head 카운터 + 큐 길이로 계산합니다. 달리 말하면, rps_dev_flow[i]에 있는 카운터는 현재 흐름 i에 대해 지정된 CPU로 큐잉 되었던 흐름 i 내의 마지막 항목을 기록합니다 (물론 실제로 항목 i는 해시가 선정하며 여러 흐름들이 같은 항목 i로 해싱 될 수도 있습니다).

이제 패킷 순서 바뀜을 피하기 위한 기술이 나옵니다. (get_rps_cpu()에서) 패킷 처리를 위한 CPU를 선정할 때 rps_sock_flow 테이블과 패킷을 수신한 큐의 rps_dev_flow 테이블을 비교합니다. 그 흐름에 대한 (rps_sock_flow 테이블에서 찾은) 바람직한 CPU가 (rps_dev_flow 테이블에서 찾은) 현재 CPU와 일치하면 패킷을 그 CPU의 백로그로 큐잉 합니다. 만약 다르다면, 다음 중 하나가 참인 경우에 현재 CPU를 갱신하여 바람직한 CPU와 일치하게 만듭니다.

- 현재 CPU의 큐 head 카운터 >= rps_dev_flow[i]에 기록된 tail 카운터 값
- 현재 CPU가 설정되어 있지 않음 (>= nr_cpu_ids)
- 현재 CPU가 오프라인임

이 검사 후에 (갱신되었을 수도 있는) 현재 CPU로 패킷을 보냅니다. 이 규칙들의 목표는 이전 CPU에 미처리 패킷이 없을 때에만 흐름이 새로운 CPU로 이동하도록 보장하는 것인데, 그러지 않으면 새로운 CPU에서 처리하려 하는 패킷들보다 미처리 패킷이 늦게 도착할 수 있을 겁니다.

RFS 구성

RFS는 kconfig 심볼 CONFIG_RPS가 켜진 경우에만 (SMP에서 기본으로 켜져 있음) 사용 가능합니다. 명시적으로 구성해주기 전까지는 기능성이 꺼진 상태입니다. 다음을 통해 전역 흐름 테이블 내의 항목 개수를 설정합니다.

/proc/sys/net/core/rps_sock_flow_entries

다음을 통해 큐별 흐름 테이블 내의 항목 개수를 설정합니다.

/sys/class/net/<dev>/queues/rx-<n>/rps_flow_cnt

제안하는 구성

수신 큐에서 RFS를 켜기 전에 이 둘 모두를 설정할 필요가 있습니다. 두 값 모두를 가장 가까운 2의 제곱으로 올립니다. 제안하는 흐름 개수는 어떤 시점에서 활동적인 연결 개수 예상치에 따라 다른데, 그 수는 열린 연결 개수보다 상당히 작을 수도 있습니다. 중간 정도 부하가 있는 서버에서 rps_sock_flow_entries에 32768 값이 꽤 괜찮은 것을 확인했습니다.

단일 큐 장치에서는 보통 그 단일 큐에 대한 rps_flow_cnt 값을 rps_sock_flow_entries와 같은 값으로 설정하게 될 것입니다. 다중 큐 장치에서는 각 큐의 rps_flow_cnt를 rps_sock_flow_entries / N으로 (N은 큐 개수) 설정할 수 있을 겁니다. 따라서 가령 rps_sock_flow_entries가 32768로 설정되어 있고 16개의 수신 큐가 구성되어 있으면 각 큐에 대한 rps_flow_cnt를 2048로 구성할 수 있을 겁니다.

가속 RFS

가속(accelerated) RFS와 RFS의 관계는 RSS와 RPS의 관계와 같습니다. 즉, 각 흐름의 패킷들을 소모하는 응용 스레드가 어디서 돌고 있는지에 따라 흐름들의 방향을 결정하기 위해 소프트 상태를 이용하는 하드웨어 가속 부하 분산 메커니즘입니다. 가속 RFS는 RFS보다 더 성능이 나을 텐데, 데이터를 소모하는 스레드에 로컬인 CPU로 패킷을 바로 보내기 때문입니다. 대상 CPU는 응용이 도는 바로 그 CPU이거나, 아니면 적어도 캐시 위계에서 응용 스레드의 CPU에 로컬인 CPU입니다.

가속 RFS를 가능하게 하기 위해 네트워킹 스택에서는 ndo_rx_flow_steer 드라이버 함수를 호출하여 특정 흐름에 일치하는 패킷들에 바람직한 하드웨어 큐를 알립니다. 네트워크 스택에서는 rps_dev_flow_table 내의 흐름 항목이 갱신될 때마다 자동으로 이 함수를 호출합니다. 그러면 드라이버에서는 드라이버별로 고유한 방법을 사용하여 그 패킷들의 방향을 조종하도록 NIC를 프로그래밍 합니다.

흐름에 대한 하드웨어 큐는 rps_dev_flow_table에 기록된 CPU로 유도해냅니다. 스택에서 NIC 드라이버가 유지하는 하드웨어 큐 맵에 CPU를 확인해봅니다. 이는 /proc/interrupts에서 보이는 IRQ 친화성 테이블로 자동 생성한 역방향 맵입니다. cpu_rmap ("CPU affinity reverse map") 커널 라이브러리에 있는 함수들을 사용하여 드라이버가 그 맵을 채웁니다. 각 CPU에 대해 맵 내의 큐는 처리 CPU가 캐시 지역성 측면에서 가장 가까운 큐입니다.

가속 RFS 구성

가속 RFS는 커널을 CONFIG_RFS_ACCEL로 컴파일 하고 NIC 장치 및 드라이버에서 지원을 제공하는 경우에만 사용 가능합니다. 또한 ethtool을 통해 ntuple 필터링을 켜주어야 합니다. CPU에서 큐로의 맵은 드라이버가 각 수신 큐에 대해 구성된 IRQ 친화성으로부터 끌어내므로 다른 추가 설정이 필요하지 않게 됩니다.

제안하는 구성

RFS를 사용하고 싶으며 NIC에서 하드웨어 가속을 지원하는 경우에는 항상 이 기법을 켜는 게 좋습니다.

XPS: Transmit Packet Steering

Transmit Packet Steering은 다중큐 장치로 패킷을 전송할 때 사용할 전송 큐를 지능적으로 선정하는 메커니즘입니다. 이를 위해 CPU에서 하드웨어 큐(들)로의 매핑을 기록합니다. 이 매핑의 목표는 일반적으로 CPU들의 어떤 부분집합에 배타적으로 큐들을 할당하는 것인데, 그 큐들에 대한 전송 완료를 그 집합 내의 CPU에서 처리합니다. 이러한 선택에는 두 가지 이득이 있습니다. 첫째로, 같은 큐에 대해 더 적은 CPU들이 경쟁하므로 장치 큐 락에 대한 경쟁이 상당히 줄어듭니다 (각 CPU가 각자의 전송 큐를 가지고 있으면 경쟁을 완전히 제거할 수 있습니다). 둘째로, 전송 완료시의 캐시 미스 비율이 감소하는데, 특히 sk_buff 구조체를 담은 데이터 캐시 라인에 대해서 그렇습니다.

XPS는 전송 큐별로 그 큐를 전송에 사용할 수 있는 CPU들의 비트맵을 설정하여 구성합니다. 각 네트워크 장치에 대해 CPU에서 전송 큐로 가는 역방향 매핑을 계산하여 유지합니다. 흐름 내의 첫 번째 패킷을 전송할 때 get_xps_queue() 함수를 호출하여 큐를 선택합니다. 이 함수는 실행 중인 CPU의 ID를 CPU에서 큐로의 검색 테이블에서 키로 사용합니다. ID가 큐 한 개에 일치하면 그 큐를 전송에 사용합니다. 여러 큐가 일치하면 흐름 해시를 이용해 그 집합에 대한 색인을 계산하여 하나를 선택합니다.

특정 흐름을 전송하는 데 선정한 큐를 흐름(가령, TCP 연결)에 대응하는 소켓 구조체에 저장합니다. 흐름 상에서 보내는 이후 패킷들에 이 전송 큐를 사용하여 순서 바뀜(out of order; ooo) 패킷을 방지합니다. 이 선정은 또한 get_xps_queue() 호출 비용을 흐름 내의 모든 패킷들로 분산(amortize)합니다. ooo 패킷을 막기 위해 흐름에 대한 큐는 흐름 내의 패킷에 대해 skb->ooo_okay가 설정된 경우에만 이후에 바뀔 수 있습니다. 이 플래그는 흐름 내에 미처리 패킷이 없음을 나타내며, 따라서 순서가 바뀐 패킷을 생성할 위험 없이 전송 큐를 바꿀 수 있습니다. ooo_okay를 적절히 설정하는 책임은 전송 계층에 있습니다. 예를 들어 TCP에서는 연결에 대한 모든 데이터가 확인응답 되었을 때 이 플래그를 설정합니다.

XPS 구성

XPS는 kconfig 심볼 CONFIG_XPS가 켜진 경우에만 (SMP에서 기본으로 켜져 있음) 사용 가능합니다. 명시적으로 구성하기 전까지는 기능성이 꺼진 상태로 있습니다. XPS를 켜려면 sysfs 파일 항목을 이용해 전송 큐를 사용할 수 있는 CPU들의 비트맵을 설정합니다.

/sys/class/net/<dev>/queues/tx-<n>/xps_cpus

제안하는 구성

전송 큐가 하나인 네트워크 장치에서는 XPS 구성이 효과가 없는데, 이 경우에는 선택의 여지가 없기 때문입니다. 다중큐 시스템에서는 각 CPU가 한 큐로 매핑 되도록 XPS를 구성하는 것이 바람직합니다. 시스템 내의 CPU 개수만큼 큐가 있다면 각 큐를 한 CPU로 매핑하여 경쟁이 없는 배타적 짝을 만들어줄 수 있습니다. CPU보다 큐의 개수가 적다면 큐에 대한 전송 완료(전송 인터럽트)를 처리하는 CPU와 캐시를 공유하는 CPU들이 그 큐를 공유하기에 가장 좋은 CPU일 것입니다.

추가 정보

RPS와 RFS는 커널 2.6.35에서 도입됐습니다. XPS는 2.6.38에서 추가됐습니다. Tom Herbert 씨(therbert@google.com)가 원안 패치들을 제출했습니다.

가속 RFS는 2.6.35에서 도입됐습니다. Ben Hutchings 씨(bwh@kernel.org)가 원안 패치들을 제출했습니다.

저자:

- Tom Herbert (therbert@google.com)
- Willem de Bruijn (willemb@google.com)

	Softnet is a reworking of the core networking subsystem to make it fully multithreaded and more cleanly done in general

	https://www.usenix.org/legacy/publications/library/proceedings/als01/full_papers/jamal/jamal.pdf