diff --git a/LINUX/Kbuild.in b/LINUX/Kbuild.in
index 4d0bf7802..988951aba 100644
--- a/LINUX/Kbuild.in
+++ b/LINUX/Kbuild.in
@@ -14,6 +14,7 @@ remoteobjs-$(CONFIG_NETMAP_VALE) += netmap_vale.o netmap_offloadings.o
 remoteobjs-$(CONFIG_NETMAP_PIPE) += netmap_pipe.o
 remoteobjs-$(CONFIG_NETMAP_MONITOR) += netmap_monitor.o
 remoteobjs-$(CONFIG_NETMAP_GENERIC) += netmap_generic.o
+remoteobjs-$(CONFIG_NETMAP_PASTE) += netmap_paste.o
 remoteobjs-$(CONFIG_NETMAP_NULL) += netmap_null.o
 
 define remote_template
diff --git a/LINUX/bsd_glue.h b/LINUX/bsd_glue.h
index fe6553444..ef45600ea 100644
--- a/LINUX/bsd_glue.h
+++ b/LINUX/bsd_glue.h
@@ -73,6 +73,10 @@
 #include <linux/page_ref.h>
 #endif /* NETMAP_LINUX_HAVE_PAGE_REF */
 
+#ifndef NETMAP_LINUX_HAVE_PAGE_TO_VIRT
+#define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x)))
+#endif /* NETMAP_LINUX_HAVE_PAGE_TO_VIRT */
+
 #ifndef NETMAP_LINUX_HAVE_HRTIMER_MODE_REL
 #define HRTIMER_MODE_REL HRTIMER_REL
 #endif
@@ -555,4 +559,89 @@ void netmap_bns_unregister(void);
 #define BIT_ULL(nr) (1ULL << (nr))
 #endif /* !BIT_ULL */
 
+/*
+ * for PASTE
+ */
+#define curcpu smp_processor_id()
+#define so_lock(_s) lock_sock(_s)
+#define so_unlock(_s) release_sock(_s)
+#define SOCKBUF_LOCK(sb)
+#define SOCKBUF_UNLOCK(sb)
+#define pause(_s, _v) usleep_range((_v) * 1000, (_v) * 1000 + 5000)
+#define nm_os_sock_set_nocoalesce(_sb)
+
+static inline u_int
+intr_disable(void)
+{
+	local_bh_disable();
+	return 0;
+}
+
+static inline void
+intr_restore(u_int intr)
+{
+	local_bh_enable();
+}
+
+#define MBUF_L3_OFST(m)		skb_network_offset(m)
+#define MBUF_L4_OFST(m)		skb_transport_offset(m)
+#define MBUF_L3_HEADER(m)	skb_network_header(m)
+#define MBUF_L4_HEADER(m)	skb_transport_header(m)
+#define MBUF_HASNEXT(m)		skb_is_nonlinear(m)
+#define MBUF_FLATTEN(m)		skb_linearize(m)
+#define MBUF_DATA(m)		(m)->data
+#define MBUF_CSUM_DONE(m)	((m)->ip_summed = CHECKSUM_COMPLETE)
+
+#define NM_SOCK_T struct sock
+#define SAVE_SOUPCALL(sk, soa) \
+	(soa)->save_soupcall = (sk)->sk_data_ready
+#define RESTORE_SOUPCALL(sk, soa) \
+	(sk)->sk_data_ready = (void *)(soa)->save_soupcall
+#define SAVE_SODTOR(sk, soa) \
+	(soa)->save_sodtor = (sk)->sk_destruct
+#define RESTORE_SODTOR(sk, soa) \
+	(sk)->sk_destruct = (void *)(soa)->save_sodtor
+#define SET_SOUPCALL(sk, f) (sk)->sk_data_ready = (void *)f
+#define SET_SODTOR(sk, f) (sk)->sk_destruct = (void *)f
+#define so_dtor sk_destruct
+#define MBUF_HDRLEN(m) skb_headlen(m)
+
+/* NMCB() is only valid for an mbuf populated by nm_os_build_mbuf() */
+#define NMCB(_m) ((struct nmcb *)(_m)->head)
+#define NMCB_EXT(_m, _i, _bs) \
+	NMCB_BUF(page_address(skb_frag_page(&skb_shinfo(_m)->frags[_i])) + \
+		 _bs * (skb_frag_off(&skb_shinfo(_m)->frags[_i]) / _bs))
+#define NMCB_BUF(_buf) ((struct nmcb *)(_buf))
+#define m_length(_m, _x) (_m)->len
+
+struct nm_ubuf_info {
+	struct ubuf_info ubuf;
+};
+
+#define nmcb_kring(nmcb) ((struct netmap_kring *)(nmcb)->ui.ubuf.ctx)
+#define nmcb_slot(nmcb) ((struct netmap_slot *)(uintptr_t)(nmcb)->ui.ubuf.desc)
+#define nmcbw(cb, kring, slot) do {\
+	(cb)->ui.ubuf.ctx = (kring);\
+	(cb)->ui.ubuf.desc = (uintptr_t)(slot);\
+} while (0)
+
+static inline struct pst_so_adapter *
+pst_so(NM_SOCK_T *sk)
+{
+	return (struct pst_so_adapter *)sk->sk_user_data;
+}
+
+/* We overwrite sk->sk_user_data, as it appears not to be used */
+static inline void
+pst_wso(struct pst_so_adapter *soa, NM_SOCK_T *sk)
+{
+	sk->sk_user_data = soa;
+}
+
+#ifndef NETMAP_LINUX_HAVE_NETIF_RECEIVE_SKB_CORE
+#define netif_receive_skb_core netif_receive_skb
+#endif /* NETMAP_LINUX_HAVE_NETIF_RECEIVE_SKB_CORE */
+#ifndef NETMAP_LINUX_HAVE_SKB_FRAG_OFF
+#define skb_frag_off(_f) (_f)->page_offset
+#endif /* NETMAP_LINUX_HAVE_SKB_FRAG_OFF */
 #endif /* NETMAP_BSD_GLUE_H */
diff --git a/LINUX/configure b/LINUX/configure
index fd3ef6cf0..cbab5b56d 100755
--- a/LINUX/configure
+++ b/LINUX/configure
@@ -95,7 +95,7 @@ setop()
 # available subsystems
 subsystem_avail="vale pipe monitor generic ptnetmap sink \
-	extmem null"
+	extmem paste null"
 #enabled subsystems (bitfield)
 subsystem=0
@@ -323,6 +323,8 @@ Available options:
 --mod-name=<name>	netmap module name [$MODNAME]
 --enable-vale		enable the VALE switch
 --disable-vale		disable the VALE switch
+--enable-paste		enable the netmap API for kernel TCP/IP
+--disable-paste		disable the netmap API for kernel TCP/IP
 --enable-pipe		enable the netmap pipes
 --disable-pipe		disable the netmap pipes
 --enable-monitor	enable the netmap monitors
@@ -1575,6 +1577,26 @@ EOF
 	EOF
 done
 
+# page_to_virt
+	add_test 'have PAGE_TO_VIRT' <<EOF
+	#include <linux/mm.h>
+	void *
+	dummy(struct page *page) {
+		return page_to_virt(page);
+	}
+EOF
+
+	# arguments of the struct ubuf_info callback (either 2 (old) or 3)
+	add_test 'define UBUF_INFO_CALLBACK_3ARGS' <<EOF
+	#include <linux/skbuff.h>
+
+	void
+	dummy(struct ubuf_info *u)
+	{
+		u->callback(NULL, NULL, 1);
+	}
+EOF
+
 # check for third argument in qdisc enqueue callbacks
 add_test 'have QDISC_ENQUEUE_TOFREE' <<EOF
@@ -1605,6 +1627,48 @@
 }
 EOF
 
+# check for kernel_sendpage_locked
+	add_test 'have KERNEL_SENDPAGE_LOCKED' <<EOF
+	#include <linux/net.h>
+
+	int
+	dummy(struct sock *sk) {
+		return kernel_sendpage_locked(sk, NULL, 0, 0, 0);
+	}
+EOF
+
+# check for netif_receive_skb_core
+	add_test 'have NETIF_RECEIVE_SKB_CORE' <<EOF
+	#include <linux/netdevice.h>
+
+	int
+	dummy(struct sk_buff *skb) {
+		return netif_receive_skb_core(skb);
+	}
+EOF
+
+	# arguments of skb_zcopy_set (either 2 or 3)
+	add_test 'define SKB_ZCOPY_SET_3ARGS' <<EOF
+	#include <linux/skbuff.h>
+
+	void
+	dummy(struct sk_buff *skb, struct ubuf_info *ui)
+	{
+		skb_zcopy_set(skb, ui, NULL);
+	}
+EOF
+
+	# check for skb_frag_off
+	add_test 'have SKB_FRAG_OFF' <<EOF
+	#include <linux/skbuff.h>
+
+	unsigned int
+	dummy(struct sk_buff *skb)
+	{
+		return skb_frag_off(&skb_shinfo(skb)->frags[0]);
+	}
+EOF
+
 # check for fault arguments
 add_test 'have FAULT_VMA_ARG' <<EOF
@@ -1751,6 +1815,17 @@
 }
 EOF
 
+	# check for SO_TIMESTAMPING in sock
+	add_test 'have SO_TIMESTAMPING' <<EOF
+	#include <net/sock.h>
+	#include <net/ip.h>
+
+	void
+	dummy(struct sock *sk, struct ipcm_cookie *ipc) {
+		sock_tx_timestamp(sk, ipc->sockc.tsflags, &ipc->tx_flags);
+	}
+EOF
+
 # check for linux/average.h
 add_test 'have AVERAGE_H' <<EOF
diff --git a/LINUX/netmap_linux.c b/LINUX/netmap_linux.c
index 98ae19b4d..97a3c563b 100644
--- a/LINUX/netmap_linux.c
+++ b/LINUX/netmap_linux.c
@@ -42,6 +42,13 @@
 #ifdef NETMAP_LINUX_HAVE_SCHED_MM
 #include <linux/sched/mm.h>
 #endif /* NETMAP_LINUX_HAVE_SCHED_MM */
+#ifdef WITH_PASTE
+#include <linux/file.h>		// sockfd_put()/fput()
+#include <linux/net.h>
+#include <net/tcp.h>
+#include <net/sock.h>		// sock_owned_by_user
+#include <linux/etherdevice.h>
+#endif /* WITH_PASTE */
 
 #include "netmap_linux_config.h"
@@ -1083,6 +1090,404 @@ nm_os_generic_set_features(struct netmap_generic_adapter *gna)
 }
 #endif /* WITH_GENERIC */
 
+#ifdef WITH_PASTE
+
+netdev_tx_t
+linux_pst_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	netmap_pst_transmit(dev, skb);
+	return (NETDEV_TX_OK);
+}
+
+/* We have no way to track subsequent fragments, but such fragments
+ * are always sent after queueing.
+ * XXX !zerocopy_success might need to be handled explicitly.
+ * zerocopy_success is false when the state is MB_TXREF and the slot
+ * is not on-ring.
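+ * (The callback below is installed as the skb's ubuf_info destructor;
+ * it runs when the stack releases its zerocopy reference and recovers
+ * the enclosing nmcb via container_of().)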
+ */
+void
+nm_os_pst_mbuf_data_dtor(
+#if defined(NETMAP_LINUX_UBUF_INFO_CALLBACK_3ARGS)
+		struct sk_buff *skb,
+#endif
+		struct ubuf_info *uarg, bool zerocopy_success)
+{
+	struct nm_ubuf_info *u = (struct nm_ubuf_info *)uarg;
+
+	pst_mbuf_data_dtor(container_of(u, struct nmcb, ui));
+}
+
+static void
+nm_os_pst_mbuf_destructor(struct sk_buff *skb)
+{
+	struct nmcb *cb = NMCB(skb);
+
+	if (likely(nmcb_valid(cb)))
+		nm_os_set_mbuf_data_destructor(skb, &cb->ui,
+			nm_os_pst_mbuf_data_dtor);
+	else
+		panic("invalid cb in our mbuf destructor");
+}
+
+void
+nm_os_set_mbuf_data_destructor(struct mbuf *m,
+		struct nm_ubuf_info *ui, void *cb)
+{
+	ui->ubuf.callback = cb;
+	if (cb != NULL) {
+#ifdef NETMAP_LINUX_SKB_ZCOPY_SET_3ARGS
+		bool ref = true;
+		skb_zcopy_set(m, (struct ubuf_info *)ui, &ref);
+#else
+		skb_zcopy_set(m, (struct ubuf_info *)ui);
+#endif
+	} else {
+		skb_zcopy_clear(m, 1);
+	}
+}
+
+/*
+ * The socket is locked when it is detached from us.
+ */
+void
+nm_os_pst_upcall(NM_SOCK_T *sk)
+{
+	struct sk_buff_head *queue = &sk->sk_receive_queue;
+	struct sk_buff *m;
+	struct netmap_kring *kring = NULL;
+
+	rcu_read_lock();
+	if (unlikely(pst_so(sk) == NULL))
+		panic("socket has no pst_so_adapter");
+	/* OOO segment(s) might have been enqueued in the same rxsync round */
+	while ((m = skb_peek(queue)) != NULL) {
+		struct nmcb *cb = NMCB(m);
+		struct netmap_slot *slot;
+		int queued = 0;
+
+		if (unlikely(!nmcb_valid(cb))) {
+			panic("invalid cb");
+		}
+		if (!kring) {
+			kring = nmcb_kring(cb);
+			/* XXX this happens when the stack goes away;
+			 * we need a better workaround */
+			if (unlikely(!kring)) {
+				PST_DBG("WARNING: no kring");
+				SET_MBUF_DESTRUCTOR(m, NULL);
+				nm_os_set_mbuf_data_destructor(m, &cb->ui, NULL);
+				__skb_unlink(m, queue);
+				__kfree_skb(m);
+				continue;
+			}
+			mtx_lock(&kring->q_lock);
+		} else if (unlikely(nmcb_kring(cb) != kring)) {
+			panic("different krings");
+		}
+		/* append this buffer to the scratchpad */
+		slot = nmcb_slot(cb);
+		if (unlikely(slot == NULL)) {
+			PST_DBG("m %p no slot", m);
+			continue;
+		}
+		/* too expensive */
+#if 0
+		if (!pst_slot_in_extra(slot, kring) &&
+		    !pst_slot_in_kring(slot, kring)) {
+			PST_DBG("invalid slot");
+			continue;
+		}
+#endif /* 0 */
+		if (unlikely(m->sk == NULL || pst_so(m->sk) == NULL)) {
+			PST_DBG("m->sk %p soa %p",
+				m->sk, m->sk ? pst_so(m->sk) : NULL);
+			continue;
+		}
+		nm_pst_setfd(slot, pst_so(sk)->fd);
+		nm_pst_setdoff(slot, (uint16_t)
+			skb_headroom(m) - nm_get_offset(kring, slot));
+		slot->len = skb_headlen(m) + nm_pst_getdoff(slot);
+		/*
+		 * We might have leftover state from a previous connection
+		 * with the same fd value. Overwrite it if this is a new
+		 * connection.
+		 */
+		pst_fdt_add(cb, kring);
+		/* see comment in pst_transmit() */
+#ifdef PST_MB_RECYCLE
+		if (unlikely(nmcb_rstate(cb) == MB_QUEUED)) {
+			queued = 1;
+		}
+#endif
+
+		nmcb_wstate(cb, MB_FTREF);
+
+		/* XXX use the new sk_eat_skb() on kernels > 5.1 */
+		__skb_unlink(m, queue);
+#ifdef PST_MB_RECYCLE
+		if (likely(!queued)) {
+			skb_orphan(m);
+		} else
+#endif
+			__kfree_skb(m);
+	}
+	if (kring)
+		mtx_unlock(&kring->q_lock);
+	rcu_read_unlock();
+}
+
+NM_SOCK_T *
+nm_os_sock_fget(int fd, void **f)
+{
+	int err;
+	struct socket *sock = sockfd_lookup(fd, &err);
+
+	return sock ? sock->sk : NULL;
+}
+
+void
+nm_os_sock_fput(NM_SOCK_T *sk, void *dummy)
+{
+	sockfd_put(sk->sk_socket);
+}
+
+int
+nm_os_pst_sbdrain(struct netmap_adapter *na, NM_SOCK_T *sk)
+{
+	struct mbuf *m;
+
+	/* XXX all the packets must originate from netmap */
+	m = skb_peek(&sk->sk_receive_queue);
+	if (!m) {
+		return 0;
+	}
+	else if (!nmcb_valid(NMCB(m))) {
+		return 0;
+	}
+	/* No need for BDG_RLOCK() - we don't move packets to the pst na */
+	nm_os_pst_upcall(sk);
+	return 0;
+}
+
+int
+nm_os_pst_mbuf_extadj(struct mbuf *m, int i, int off)
+{
+	if (unlikely(skb_shinfo(m)->nr_frags <= i))
+		return -1;
+	skb_frag_off_add(&skb_shinfo(m)->frags[i], off);
+	return 0;
+}
+
+int
+nm_os_sock_dead(NM_SOCK_T *so)
+{
+	return !!sock_flag(so, SOCK_DEAD);
+}
+
+static inline int
+nm_os_mbuf_valid(struct mbuf *m)
+{
+	return likely(*(int *)(&m->users) != 0);
+}
+
+static struct mbuf *
+nm_os_build_mbuf(struct netmap_kring *kring, char *buf, u_int len)
+{
+	struct netmap_adapter *na = kring->na;
+	struct mbuf *m;
+	struct page *page;
+	const int alen = NETMAP_BUF_SIZE(na) - sizeof(struct nmcb);
+	const u_int offset = nm_get_offset(kring, nmcb_slot(NMCB_BUF(buf)));
+
+#ifdef PST_MB_RECYCLE
+	m = kring->tx_pool[1];
+	if (m) {
+		struct skb_shared_info *shinfo;
+
+		/* XXX maybe build_skb_around(), with some overheads */
+		*m = *kring->tx_pool[0];
+		m->head = m->data = buf;
+		skb_reset_tail_pointer(m);
+		shinfo = skb_shinfo(m);
+		bzero(shinfo, offsetof(struct skb_shared_info, dataref));
+		*(int *)(&shinfo->dataref) = 1;
+		//shinfo->tx_flags |= SKBTX_DEV_ZEROCOPY;
+	} else
+#endif
+	{
+		m = build_skb(buf, alen);
+		if (unlikely(!m))
+			return NULL;
+		m->dev = na->ifp;
+	}
+#ifdef PST_MB_RECYCLE
+	if (unlikely(!nm_os_mbuf_valid(kring->tx_pool[0]))) {
+		*kring->tx_pool[0] = *m;
+	}
+#endif
+	page = virt_to_page(buf);
+	page_ref_add(page, 1);		// survive __kfree_skb()
+	skb_reserve(m, offset);		// m->data and tail
+	skb_put(m, len - offset);	// advance m->tail and m->len
+	return m;
+}
+
+int
+nm_os_pst_rx(struct netmap_kring *kring, struct netmap_slot *slot)
+{
+	struct netmap_adapter *na = kring->na;
+	char *p = NMB(na, slot);
+	struct nmcb *cb = NMCB_BUF(p);
+	struct mbuf *m;
+	int ret = 0;
+
+	m = nm_os_build_mbuf(kring, p, nm_get_offset(kring, slot) + slot->len);
+	if (unlikely(!m))
+		return 0; // drop and skip
+
+	pst_get_extra_ref(nmcb_kring(cb));
+
+	nmcb_wstate(cb, MB_STACK);
+	nm_pst_setfd(slot, 0);
+	m->protocol = eth_type_trans(m, m->dev);
+	/* have orphan() set the data destructor */
+	SET_MBUF_DESTRUCTOR(m, nm_os_pst_mbuf_destructor);
+	netif_receive_skb_core(m);
+
+	/* setting the data destructor is safe only after skb_orphan_frags()
+	 * in __netif_receive_skb_core().
+	 */
+	if (unlikely(nmcb_rstate(cb) == MB_STACK)) {
+		nmcb_wstate(cb, MB_QUEUED);
+		if (pst_extra_enq(kring, slot))
+			ret = -EBUSY;
+	}
+#ifdef PST_MB_RECYCLE
+	/* XXX avoid refcount_read...
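+	 * (skb_shared() reads the skb refcount; we only recycle the mbuf
+	 * when we are its sole owner)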
+	 */
+	if (nmcb_rstate(cb) == MB_FTREF && likely(!skb_shared(m))) {
+		/* we can recycle this mbuf (see nm_os_pst_data_ready) */
+		struct ubuf_info *uarg = skb_shinfo(m)->destructor_arg;
+
+		if (likely(uarg->callback))
+#if defined(NETMAP_LINUX_UBUF_INFO_CALLBACK_3ARGS)
+			uarg->callback(NULL, uarg, true);
+#else
+			uarg->callback(uarg, true);
+#endif
+		kring->tx_pool[1] = m;
+	} else
+		kring->tx_pool[1] = NULL;
+#endif
+	return ret;
+}
+
+int
+nm_os_pst_tx(struct netmap_kring *kring, struct netmap_slot *slot)
+{
+	struct netmap_adapter *na = kring->na;
+	struct pst_so_adapter *soa;
+	struct nmcb *cb;
+	struct page *page;
+	u_int poff, len;
+	NM_SOCK_T *sk;
+	void *nmb;
+	int err, pageref = 0;
+	const u_int pst_offset = nm_pst_getdoff(slot);
+	const int flags = MSG_DONTWAIT;
+
+	nmb = NMB(na, slot);
+	soa = pst_soa_from_fd(na, nm_pst_getfd(slot));
+	if (unlikely(!soa)) {
+		PST_DBG("no soa of fd %d", nm_pst_getfd(slot));
+		return 0;
+	}
+	sk = soa->so;
+
+	page = virt_to_page(nmb);
+	get_page(page);	// survive __kfree_skb()
+	pageref = page_ref_count(page);
+	cb = NMCB_BUF(nmb);
+	poff = nmb - page_to_virt(page) +
+	       nm_get_offset(kring, slot) + pst_offset;
+	len = slot->len - pst_offset;
+	nmcb_wstate(cb, MB_STACK);
+
+	if (unlikely(!sk)) {
+		PST_DBG("NULL sk");
+		nmcb_invalidate(cb);
+		return 0;
+	} else if (unlikely(!sk->sk_socket)) {
+		PST_DBG("NULL sk->sk_socket");
+		nmcb_invalidate(cb);
+		return 0;
+	}
+
+#ifdef NETMAP_LINUX_HAVE_KERNEL_SENDPAGE_LOCKED
+	/*
+	 * We don't really own the lock. But since we only actively receive
+	 * packets, the RX path never tries to lock the socket.
+	 * If the kernel is configured to detect incorrect locking, disable
+	 * paste_optim_sendpage.
+	 */
+	if (paste_optim_sendpage)
+		err = kernel_sendpage_locked(sk, page, poff, len, flags);
+	else
+#endif /* NETMAP_LINUX_HAVE_KERNEL_SENDPAGE_LOCKED */
+		err = kernel_sendpage(sk->sk_socket, page, poff, len, flags);
+	if (unlikely(err < 0)) {
+		/* XXX check whether it is enough to assume EAGAIN only */
+		PST_DBG_LIM("error %d in sendpage() slot %ld fd %d",
+			err, slot - kring->ring->slot, soa->fd);
+		return err;
+	}
+
+	if (unlikely(nmcb_rstate(cb) == MB_STACK)) {
+		/* The stack might have just dropped a page reference (e.g.,
+		 * linearized in skb_checksum_help() in __dev_queue_xmit()).
+		 */
+		if (unlikely(pageref == page_ref_count(page))) {
+			PST_DBG("dropped frag ref (fd %d)", nm_pst_getfd(slot));
+			nmcb_invalidate(cb);
+			return 0;
+		}
+		nmcb_wstate(cb, MB_QUEUED);
+
+		if (likely(pst_extra_enq(kring, slot)))
+			return -EBUSY;
+	} /* usually MB_TXREF (TCP) or MB_NOREF (UDP) */
+	return 0;
+}
+
+/* tcp_sock_set_nodelay() locks the socket by itself, and we don't need
+ * to push pending frames, so just set the flag manually.
+ */
+int
+nm_os_set_nodelay(NM_SOCK_T *so)
+{
+	tcp_sk(so)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+	return 0; // FreeBSD returns a status
+}
+
+int
+nm_os_kthread_add(void *f, void *arg, void *proc, struct thread **tdptr,
+		int flags, int pages, const char *fmt)
+{
+	*tdptr = (struct thread *)kthread_create(f, arg, "netmap-pst-kwait");
+	wake_up_process((struct task_struct *)*tdptr);
+	return 0;
+}
+
+int
+nm_os_hwcsum_ok(struct netmap_adapter *na)
+{
+	return na->ifp->features & NETIF_F_CSUM_MASK;
+}
+
+int
+nm_os_so_connected(NM_SOCK_T *so)
+{
+	return so->sk_socket->state == SS_CONNECTED;
+}
+
+#endif /* WITH_PASTE */
+
 /* Use ethtool to find the current NIC rings lengths, so that the netmap
    rings can have the same lengths. */
 int
diff --git a/apps/phttpd/Makefile b/apps/phttpd/Makefile
new file mode 100644
index 000000000..a0ef21ec6
--- /dev/null
+++ b/apps/phttpd/Makefile
@@ -0,0 +1,70 @@
+#CFLAGS += -Werror -Wall -O2 -g -fsanitize=address
+CFLAGS += -Werror -Wall -O2 -g
+NMLIB = ../../libnetmap/libnetmap.a
+LSMLIB = ../../../lsm_nvm
+NOCFLAGS := $(CFLAGS)
+NOCFLAGS += -DNOLIBNETMAP -DLIBNETMAP_NOTHREADSAFE
+
+LDFLAGS += -lpthread -lm
+UNAME := $(shell uname)
+ifeq ($(UNAME), Linux)
+LDFLAGS += -lbsd
+CC = gcc
+else
+CC = clang
+endif
+NOLDFLAGS := $(LDFLAGS)
+LDFLAGS += $(NMLIB)
+#CFLAGS += -I/usr/local/include -I../libsqlite/include
+OBJS = phttpd-b.o phttpd-f.o bplus_support.o bplus_impl.o phttpd-l.o phttpd-o.o
+SPATH ?= -I../../sys/ -I../include -I../../libnetmap -DNETMAP_WITH_LIBS
+BOPT = -DWITH_BPLUS -I./
+FOPT = -DWITH_NOFLUSH
+OOPT = -DWITH_CLFLUSHOPT -mclflushopt
+
+LOPT = -DWITH_LEVELDB -I$(LSMLIB)/include
+LLDOPT = $(LSMLIB)/out-static/libleveldb.a -lnuma
+
+#
+# compile phttpd-o only if your CPU supports clflushopt
+#
+PROG = phttpd phttpd-b phttpd-f nophttpd phttpd-o
+ALLPROG = $(PROG) phttpd-l
+
+all: $(PROG)
+
+#test_nvdimm: test_nvdimm.o
+#	$(CC) $(CFLAGS) -o test_nvdimm test_nvdimm.o $(LDFLAGS) $(EXTRA_LDFLAGS)
+#test_nvdimm.o: test_nvdimm.c nmlib.h
+#	$(CC) $(CFLAGS) $(OPT) $(SOPT) $(SPATH) -c test_nvdimm.c -o test_nvdimm.o $(EXTRA_CFLAGS)
+
+nophttpd: phttpd.c nmlib.h
+	$(CC) $(NOCFLAGS) $(SPATH) phttpd.c -o nophttpd $(NOLDFLAGS)
+
+phttpd: phttpd.c nmlib.h
+	$(CC) $(CFLAGS) $(OPT) $(SOPT) $(SPATH) phttpd.c -o phttpd $(EXTRA_CFLAGS) $(LDFLAGS)
+
+phttpd-o: phttpd-o.o
+	$(CC) $(CFLAGS) -o phttpd-o phttpd-o.o $(LDFLAGS) $(EXTRA_CFLAGS)
+phttpd-o.o: phttpd.c nmlib.h
+	$(CC) $(CFLAGS) $(OPT) $(SOPT) $(OOPT) $(SPATH) -c phttpd.c -o phttpd-o.o $(EXTRA_CFLAGS)
+
+phttpd-f: phttpd-f.o
+	$(CC) $(CFLAGS) -o phttpd-f phttpd-f.o $(LDFLAGS) $(EXTRA_CFLAGS)
+phttpd-f.o: phttpd.c nmlib.h
+	$(CC) $(CFLAGS) $(OPT) $(SOPT) $(FOPT) $(SPATH) -c phttpd.c -o phttpd-f.o $(EXTRA_CFLAGS)
+
+phttpd-b: phttpd-b.o bplus_support.o bplus_impl.o
+	$(CC) $(CFLAGS) -o phttpd-b phttpd-b.o bplus_impl.o bplus_support.o $(LDFLAGS) $(EXTRA_CFLAGS)
+phttpd-b.o: phttpd.c nmlib.h bplus_common.h bplus_support.h
+	$(CC) $(CFLAGS) $(OPT) $(SOPT) $(BOPT) $(SPATH) -c phttpd.c -o phttpd-b.o $(EXTRA_CFLAGS)
+bplus_impl.o: bplus_impl.c
+	$(CC) $(CFLAGS) $(BOPT) -c bplus_impl.c
+bplus_support.o: bplus_support.c
+	$(CC) $(CFLAGS) $(BOPT) -c bplus_support.c
+phttpd-l: phttpd-l.o
+	$(CC) $(CFLAGS) -o phttpd-l phttpd-l.o $(LDFLAGS) $(EXTRA_CFLAGS) $(LLDOPT)
+phttpd-l.o: phttpd.c nmlib.h
+	$(CC) $(CFLAGS) $(OPT) $(SOPT) $(LOPT) $(SPATH) -c phttpd.c -o phttpd-l.o $(EXTRA_CFLAGS)
+clean:
+	-@rm -f $(ALLPROG) $(OBJS)
diff --git a/apps/phttpd/bplus_common.h b/apps/phttpd/bplus_common.h
new file mode 100644
index 000000000..da5499da6
--- /dev/null
+++ b/apps/phttpd/bplus_common.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2004 Douglas Santry
+ * All rights reserved.
+ *
+ */
+
+#ifndef __BTREE__H__
+#define __BTREE__H__
+
+#include <stdint.h>
+
+#define TREE_TYPE uint64_t
+
+#define ASSERT assert
+#define VERIFY assert
+
+#define BTREE_MAX_DEPTH	6	/* Maximum depth of tree */
+
+typedef uint64_t btree_key;
+#define BTREE_KEY_MAX ((btree_key) 0x7fffffffffffffffll)
+
+/*
+ * Type for the path through the tree.
+ *
+ */
+
+typedef struct {
+
+	vbn_t		pt_bno;
+	uint32_t	pt_index;
+
+} path_node;
+
+/*
+ * This is the type used in tree lookups. All trees will have
+ * to include this as the first entry in their lookup type.
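+ * A hypothetical caller-side type would embed it like:
+ *
+ *	typedef struct {
+ *		btree_lookup_args	mt_args;	<- must be first
+ *		int			mt_private;
+ *	} mytree_lookup_args;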
+ *
+ */
+typedef struct {
+
+	/* IN */
+	btree_key	bt_key;		/* Looking for... */
+	TREE_TYPE	bt_data;	/* For insert */
+	TREE_TYPE	bt_range;	/* For range */
+	uint32_t	bt_intent;	/* Plans for key, if found */
+	gfile_t		*bt_vp;		/* inode for tree */
+
+	/* Stack of nodes traversed; do not touch */
+	path_node	bt_path[BTREE_MAX_DEPTH];
+
+	/* OUT */
+	int32_t		bt_rc;
+	gbuf_t		*bt_bp;
+	int		bt_index;
+	TREE_TYPE	*bt_datap;	/* Only valid when LFLAG_FAST is set */
+	TREE_TYPE	*bt_rangep;	/* Only valid when LFLAG_FAST is set */
+} btree_lookup_args;
+
+#define BTREE_LFLAG_LOOKUP	0x0001	/* No change of state planned */
+#define BTREE_LFLAG_INSERT	0x0002	/* We are inserting an element */
+#define BTREE_LFLAG_DELETE	0x0004	/* Deleting an element */
+#define BTREE_LFLAG_RANGE	0x0010	/* Range search, reverse on fail */
+#define BTREE_LFLAG_UPDATE	0x0020	/* Update entry */
+#define BTREE_LFLAG_BUFFER	0x0040	/* Return locked buffer */
+#define BTREE_LFLAG_FAST	0x0080	/* No buffer locking */
+
+#define BTREE_ARGS_SET_LOOKUP(X, KEY, VP) { (X).bt_key = (btree_key) KEY; \
+					(X).bt_intent = BTREE_LFLAG_LOOKUP; \
+					(X).bt_vp = VP; }
+
+#define BTREE_ARGS_SET_INSERT(X, KEY, DATA, VP) { (X).bt_key = (btree_key) KEY; \
+					(X).bt_data = DATA; \
+					(X).bt_intent = BTREE_LFLAG_INSERT; \
+					(X).bt_vp = VP; }
+
+#define BTREE_ARGS_SET_UPDATE(X, KEY, DATA, VP) { (X).bt_key = (btree_key) KEY; \
+					(X).bt_data = DATA; \
+					(X).bt_intent = BTREE_LFLAG_UPDATE; \
+					(X).bt_vp = VP; }
+
+#define BTREE_ARGS_SET_DELETE(X, KEY, VP) { (X).bt_key = (btree_key) KEY; \
+					(X).bt_intent = BTREE_LFLAG_DELETE; \
+					(X).bt_vp = VP; }
+
+#define BTREE_RC_FOUND		0
+#define BTREE_RC_NOTFOUND	2
+#define BTREE_RC_INSERTED	-2
+#define BTREE_RC_DELETED	-3
+#define BTREE_RC_DONE		-4
+#define BTREE_RC_ERROR		EIO
+#define BTREE_RC_RANGE		-6
+#define BTREE_RC_NA		-7
+
+int btree_lookup (gfile_t *, btree_key, TREE_TYPE *);
+int btree_insert (gfile_t *, btree_key, TREE_TYPE);
+int btree_create_btree (char *, gfile_t **);
+
+void
+btree_paranoid(gfile_t *, vbn_t, btree_key);
+
+#endif /* header inclusion */
diff --git a/apps/phttpd/bplus_impl.c b/apps/phttpd/bplus_impl.c
new file mode 100644
index 000000000..3c508ef5b
--- /dev/null
+++ b/apps/phttpd/bplus_impl.c
@@ -0,0 +1,1315 @@
+/*
+ * Copyright (C) 2004 Douglas Santry
+ * All rights reserved.
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#define xmove(SRC, DEST, X) memmove (DEST, SRC, X)
+
+#ifndef TREE_LOCK
+#define TREE_LOCK(X)
+#define TREE_UNLOCK(X)
+#endif
+
+typedef enum {
+	BINSERT,
+	BPUSHLEFT,
+	BLPIVOT,
+	BPULLLEFT,
+	BPUSHRIGHT,
+	BRPIVOT,
+	BPULLRIGHT,
+	BSPLIT,
+	BSCREATE,
+	BUPDATE,
+	BHEIGHT,
+	BDELETE
+} btree_lastop;
+
+#define BTREE_ROOT_ADDR	0xffffffff	/* uint32_t infinity */
+#define BTREE_ROOT_FBN	0
+
+#define BTREE_HDR_MAGIC	0xa55a5aa5
+
+/*
+ * Each block in the tree has one of these headers describing it.
+ *
+ */
+typedef struct {
+
+	uint32_t	bm_magic;
+	uint32_t	bm_fbn;
+	uint32_t	bm_serialno;
+	uint16_t	bm_nkeys;
+	uint16_t	bm_level;
+
+	vbn_t		bm_next;
+	vbn_t		bm_prev;
+	btree_lastop	bm_lop;
+	btree_key	bm_keys[1];
+} btree_meta;
+
+typedef union {
+
+	TREE_TYPE	*bu_data;
+	vbn_t		*bu_children;
+} btree_data;
+
+/*
+ * Anchor for the linked list of interior nodes in a B+ tree (NULL fbn)
+ *
+ */
+#define NODE_ANCHOR 0xffffffff
+
+/*
+ * N is the maximum number of keys in a node.
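+ * (With the defaults in bplus_support.h - TREE_BSIZE 4096, 8-byte keys
+ * and 8-byte TREE_TYPE payloads - that is roughly 250 keys per leaf and
+ * roughly 330 per interior node.)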
+ * + */ + +#define BTREE_N0 ((TREE_BSIZE - sizeof(btree_meta) - sizeof(TREE_TYPE)) / \ + (sizeof(btree_key) + sizeof(TREE_TYPE))) + +#define BTREE_NX ((TREE_BSIZE - sizeof(btree_meta) - sizeof(vbn_t)) / \ + (sizeof(btree_key) + sizeof(vbn_t))) + +#define BTREE_N(X) ((X)->bm_level == 0 ? BTREE_N0 : BTREE_NX) + +#define OVERFLOW_PRED(X) ((X)->bm_nkeys == BTREE_N(X)) + +/* + * Threshold that determines the choice between splitting and shifting + * on the insert case for an interior node. + * + * We shift if the amount of free space is 11% or greater + * + */ +#define SHIFT_PRED0(X) ((BTREE_N0 / (BTREE_N0 - (X)->bm_nkeys)) <= 9) +#define SHIFT_PREDX(X) ((BTREE_NX / (BTREE_NX - (X)->bm_nkeys)) <= 9) +#define SHIFT_PRED(X) ((X)->bm_level == 0 ? SHIFT_PRED0(X) : SHIFT_PREDX(X)) + +#define INIT_BTREE_NODE(BP, HDR, TABLE) { \ + HDR = ((btree_meta *) (BP)->b_data); \ + TABLE.bu_data = ((TREE_TYPE *) (HDR->bm_keys + BTREE_N(HDR))); } + +static uint32_t serialno = 0; + +static int +btree_lookup_work(vbn_t, btree_lookup_args *); + +static vbn_t +btree_split_node(gbuf_t *original_bp, btree_key *new_key); + +static void +btree_increase_height(gbuf_t *root, vbn_t, btree_key key); + +static int +btree_shift_right(gbuf_t *left_bp, btree_lookup_args *); + +static int +btree_shift_left(gbuf_t *left_bp, btree_lookup_args *); + +static int +btree_intra_lookup(gbuf_t *, btree_key); + +static void +btree_insert_modify(btree_lookup_args *, btree_key, TREE_TYPE *, int); + +static void +btree_delete_modify(btree_lookup_args *, int); + +static int +btree_entry (btree_lookup_args *); + +#ifdef BTREE_OVERWRITE +static int +btree_overwrite(btree_lookup_args *); +#endif + +#if 0 +static void +btree_collapse_node(gbuf_t *); +#endif + +#if 0 +#define D printf("%d\t@%d\n", ME, __LINE__) +#endif +#define D + +extern int busy_bufs[]; + +static vbn_t +btree_grow(gfile_t *); + +#ifdef BTREE_ITER +static void btree_traverse_reset (gfile_t *vp); +static TREE_TYPE *btree_traverse_next (gfile_t *vp); +#endif + +int +btree_entry (btree_lookup_args *cookie) +{ + int rc; + + (void)rc; + TREE_LOCK(cookie->bt_vp); + + if ((cookie->bt_intent & BTREE_LFLAG_LOOKUP) == 0) serialno++; + + cookie->bt_rc = BTREE_RC_NA; + cookie->bt_bp = 0; + cookie->bt_index = -1; + + rc = btree_lookup_work(BTREE_ROOT_FBN, cookie); + ASSERT(cookie->bt_rc != BTREE_RC_NA); + +#if 0 + ASSERT(busy_bufs[ME] == 0); +#endif + + TREE_UNLOCK(cookie->bt_vp); + + return cookie->bt_rc; +} + +int +btree_lookup_work(vbn_t bno, btree_lookup_args *cookie) +{ + gbuf_t *bp; + btree_meta *b_hdr; + btree_data b_table; + vbn_t fbn; + int cursor; + int rc = 0; + int keep_bp_ref = 0; +#ifdef PARANOID + int i; +#endif + + bp = bread(cookie->bt_vp, bno); + INIT_BTREE_NODE(bp, b_hdr, b_table); + ASSERT(b_hdr->bm_magic == BTREE_HDR_MAGIC); + + /* + * Mark top of path. 
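+	 * (bt_path[] records the block and index visited at each level;
+	 * the BTREE_ROOT_ADDR sentinel one level above the root tells
+	 * btree_insert_modify() when to stop walking up.)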
+ * + */ + if (bno == BTREE_ROOT_FBN) { + + cookie->bt_path[b_hdr->bm_level+1].pt_index = BTREE_ROOT_ADDR; + cookie->bt_path[b_hdr->bm_level+1].pt_bno = BTREE_ROOT_ADDR; + } + + cookie->bt_path[b_hdr->bm_level].pt_bno = bno; + + cursor = btree_intra_lookup(bp, cookie->bt_key); + ASSERT((0 <= cursor) && (cursor <= b_hdr->bm_nkeys)); + cookie->bt_path[b_hdr->bm_level].pt_index = cursor; + + if (b_hdr->bm_level) { + + fbn = b_table.bu_children[cursor]; + + brelse(bp); + rc = btree_lookup_work(fbn, cookie); + } else { + + if (cursor < b_hdr->bm_nkeys && + b_hdr->bm_keys[cursor] == cookie->bt_key) { + + if (cookie->bt_intent & BTREE_LFLAG_BUFFER) { + + cookie->bt_bp = bp; + cookie->bt_index = cursor; + keep_bp_ref = 1; + } + + if (cookie->bt_intent & BTREE_LFLAG_DELETE) { + + brelse(bp); + btree_delete_modify(cookie, 0); + cookie->bt_rc = BTREE_RC_DELETED; + + } else if (cookie->bt_intent & BTREE_LFLAG_UPDATE) { + + b_table.bu_data[cursor] = cookie->bt_data; + cookie->bt_rc = BTREE_RC_DONE; + bdwrite(bp); + + } else if (cookie->bt_intent & BTREE_LFLAG_FAST) { + + /* + * We are returning a pointer to buffer + * memory. Only safe in single-threaded + * applications with no intervening + * non-idempotent operations. + * + */ + cookie->bt_datap = &b_table.bu_data[cursor]; + if (!keep_bp_ref) brelse(bp); + cookie->bt_rc = BTREE_RC_FOUND; + } else { + + cookie->bt_data = b_table.bu_data[cursor]; + cookie->bt_rc = BTREE_RC_FOUND; + if (!keep_bp_ref) brelse(bp); + } + } else if (cookie->bt_intent & BTREE_LFLAG_INSERT) { + + brelse(bp); + btree_insert_modify(cookie, + cookie->bt_key, + &cookie->bt_data, + 0); + cookie->bt_rc = BTREE_RC_INSERTED; + } else if (cookie->bt_intent & BTREE_LFLAG_RANGE && + b_hdr->bm_nkeys) { + + cookie->bt_rc = BTREE_RC_RANGE; + + if (cookie->bt_intent & BTREE_LFLAG_BUFFER) { + + cookie->bt_bp = bp; + cookie->bt_index = cursor; + keep_bp_ref = 1; + } + + cookie->bt_key = b_hdr->bm_keys[cursor]; + + if (cookie->bt_intent & BTREE_LFLAG_FAST) + cookie->bt_datap = &b_table.bu_data[cursor]; + else + cookie->bt_data = b_table.bu_data[cursor]; + + if (cursor == 0) { + + fbn = b_hdr->bm_prev; + if (!keep_bp_ref) brelse(bp); + if ((u_int)fbn == NODE_ANCHOR) { + + cookie->bt_rangep = 0; + return rc; + } + bp = bread(cookie->bt_vp, fbn); + INIT_BTREE_NODE(bp, b_hdr, b_table); + ASSERT(b_hdr->bm_magic == BTREE_HDR_MAGIC); + cursor = b_hdr->bm_nkeys - 1; + } else cursor--; + + if (cookie->bt_intent & BTREE_LFLAG_FAST) + cookie->bt_rangep = &b_table.bu_data[cursor]; + else + cookie->bt_range = b_table.bu_data[cursor]; + + if (!keep_bp_ref) brelse(bp); + } else { + + cookie->bt_rc = BTREE_RC_NOTFOUND; + brelse(bp); + } + } + + return rc; +} + +/* + * This routine recursively propagates changes from leaves to the root + * of a tree. It keeps splitting and shifting its way up until we hit + * the root or a node absorbs the insertion without overflowing. 
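+ * (Each level re-reads the node recorded in bt_path[] during the
+ * descent; a split passes the new separator key and sibling block
+ * number one level up.)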
+ * + */ + +void +btree_insert_modify(btree_lookup_args *cookie, + btree_key key, + TREE_TYPE *payload, + int level) +{ +#if 0 + extern int nleaves; +#endif + gbuf_t *left_bp=0, *right_bp=0; + gbuf_t *bp; + vbn_t greater_bno; + btree_meta *b_hdr, *fixup_hdr; + btree_data b_table, fixup_table; + btree_key new_key; + int stop_here; + int rc; + + (void)fixup_table; + bp = bread(cookie->bt_vp, cookie->bt_path[level].pt_bno); + stop_here = cookie->bt_path[level].pt_index; + + VERIFY(bp); + INIT_BTREE_NODE(bp, b_hdr, b_table); + ASSERT(b_hdr->bm_magic == BTREE_HDR_MAGIC); + ASSERT(b_hdr->bm_level == level); + + if (b_hdr->bm_nkeys == 0) { + + VERIFY(bp->b_blkno == BTREE_ROOT_FBN); + + b_hdr->bm_nkeys = 1; + b_hdr->bm_keys[0] = key; + b_table.bu_data[0] = *payload; + + bdwrite(bp); + + return; + } + + /* + * We keep the keys in sorted order. Having found the + * appropriate point to insert (stop_here) we shift everything + * over to make room. + * + */ + + xmove(b_hdr->bm_keys + stop_here, + b_hdr->bm_keys + stop_here + 1, + sizeof(btree_key) * (b_hdr->bm_nkeys - stop_here)); + + b_hdr->bm_keys[stop_here] = key; + + if (level > 0) { + xmove(b_table.bu_children + stop_here + 1, + b_table.bu_children + stop_here + 2, + sizeof(vbn_t) * (b_hdr->bm_nkeys - stop_here)); + b_table.bu_children[stop_here + 1] = *(vbn_t *) payload; + } else { + xmove(b_table.bu_data + stop_here, + b_table.bu_data + stop_here + 1, + sizeof(TREE_TYPE) * (b_hdr->bm_nkeys - stop_here)); + b_table.bu_data[stop_here] = *payload; + } + + b_hdr->bm_nkeys++; + + b_hdr->bm_lop = BINSERT; + b_hdr->bm_serialno = serialno; + + if (OVERFLOW_PRED(b_hdr)) { + + if ((u_int)b_hdr->bm_prev != NODE_ANCHOR) { + + left_bp = bread(cookie->bt_vp, b_hdr->bm_prev); + INIT_BTREE_NODE(left_bp, fixup_hdr, fixup_table); + + if (SHIFT_PRED(fixup_hdr)) { + + brelse(left_bp); + rc = btree_shift_left(bp, cookie); + if (rc == 0) return; + } else brelse(left_bp); + } + + if ((u_int)b_hdr->bm_next != NODE_ANCHOR) { + + right_bp = bread(cookie->bt_vp, b_hdr->bm_next); + INIT_BTREE_NODE(right_bp, fixup_hdr, fixup_table); + + if (SHIFT_PRED(fixup_hdr)) { + + brelse(right_bp); + rc = btree_shift_right(bp, cookie); + if (rc == 0) return; + } else brelse(right_bp); + } + + greater_bno = btree_split_node(bp, &new_key); + + if (cookie->bt_path[level+1].pt_index != BTREE_ROOT_ADDR) + btree_insert_modify(cookie, + new_key, + (TREE_TYPE *) &greater_bno, + level + 1); + else /* we are the root */ + btree_increase_height(bp, greater_bno, new_key); + } + + bdwrite(bp); +} + +vbn_t +btree_split_node(gbuf_t *original_bp, + btree_key *new_key) +{ + gfile_t *vp; + gbuf_t *sibling_bp; + gbuf_t *right_bp; + btree_meta *original_hdr; + btree_data original_table; + btree_meta *sibling_hdr; + btree_data sibling_table; + int lesser_length; + int greater_length; + vbn_t new_bno; + + vp = original_bp->b_vp; + new_bno = btree_grow(vp); + + INIT_BTREE_NODE(original_bp, original_hdr, original_table); + ASSERT(original_hdr->bm_magic == BTREE_HDR_MAGIC); + original_hdr->bm_lop = BSPLIT; + original_hdr->bm_serialno = serialno; + + sibling_bp = bread(vp, new_bno); + VERIFY(sibling_bp); + INIT_BTREE_NODE(sibling_bp, sibling_hdr, sibling_table); + + if ((u_int)original_hdr->bm_next != NODE_ANCHOR) { + + right_bp = bread(vp, original_hdr->bm_next); + + ((btree_meta *) (right_bp->b_data))->bm_prev = + sibling_bp->b_blkno; + ((btree_meta *) (right_bp->b_data))->bm_serialno = serialno; + ((btree_meta *) (right_bp->b_data))->bm_lop = BUPDATE; + bdwrite(right_bp); + } + + *sibling_hdr = 
*original_hdr;
+	INIT_BTREE_NODE(sibling_bp, sibling_hdr, sibling_table);
+	original_hdr->bm_next = sibling_bp->b_blkno;
+	sibling_hdr->bm_prev = original_bp->b_blkno;
+	sibling_hdr->bm_fbn = sibling_bp->b_blkno;
+	sibling_hdr->bm_lop = BSCREATE;
+
+	lesser_length = original_hdr->bm_nkeys >> 1;
+	greater_length = original_hdr->bm_nkeys - lesser_length;
+
+	xmove(original_hdr->bm_keys + lesser_length,
+	      sibling_hdr->bm_keys, greater_length * sizeof(btree_key));
+
+	if (original_hdr->bm_level) {
+
+		xmove(original_table.bu_children + lesser_length,
+		      sibling_table.bu_children,
+		      (greater_length + 1) * sizeof(vbn_t));
+
+		lesser_length--;
+		*new_key = original_hdr->bm_keys[lesser_length];
+	} else {
+		xmove(original_table.bu_data + lesser_length,
+		      sibling_table.bu_data,
+		      greater_length * sizeof(TREE_TYPE));
+
+		*new_key = sibling_hdr->bm_keys[0];
+	}
+
+	original_hdr->bm_nkeys = lesser_length;
+	sibling_hdr->bm_nkeys = greater_length;
+
+	bdwrite(sibling_bp);
+
+	return new_bno;
+}
+
+/*
+ * If the root node of the tree overflows we can't just split it. We need
+ * to increase the height of the tree. We keep the root of the tree at fbn 0.
+ *
+ */
+void
+btree_increase_height(gbuf_t *root,
+		      vbn_t greater_bno,
+		      btree_key key)
+{
+	vbn_t new_bno;
+	gbuf_t *greater_bp;
+	gbuf_t *will_be_lesser_bp;
+	btree_meta *b_hdr;
+	btree_data b_table;
+
+	new_bno = btree_grow(root->b_vp);
+
+	will_be_lesser_bp = bread(root->b_vp, new_bno);
+	greater_bp = bread(root->b_vp, greater_bno);
+
+	/*
+	 * DJS_debug - this needs to be a page flip.
+	 *
+	 */
+	xmove(root->b_data, will_be_lesser_bp->b_data, TREE_BSIZE);
+	INIT_BTREE_NODE(greater_bp, b_hdr, b_table);
+	b_hdr->bm_prev = will_be_lesser_bp->b_blkno;
+	b_hdr->bm_serialno = serialno;
+
+	INIT_BTREE_NODE(will_be_lesser_bp, b_hdr, b_table);
+	b_hdr->bm_fbn = will_be_lesser_bp->b_blkno;
+	b_hdr->bm_serialno = serialno;
+
+	INIT_BTREE_NODE(root, b_hdr, b_table);
+	ASSERT(b_hdr->bm_magic == BTREE_HDR_MAGIC);
+	b_hdr->bm_level++;
+	if (b_hdr->bm_level == 1)
+		INIT_BTREE_NODE(root, b_hdr, b_table);
+	b_hdr->bm_lop = BHEIGHT;
+	b_hdr->bm_serialno = serialno;
+
+#if 0
+	/*
+	 * Leaves look subtly different from interior nodes. Their
+	 * key/payload ratios differ. We determine if we are growing
+	 * from a one-node tree here and adjust the format of the node.
+	 *
+	 */
+	if (b_hdr->bm_level == 1) {
+
+		/*
+		 * Get everything pointing to the proper area.
+		 *
+		 */
+		INIT_BTREE_NODE(root, b_hdr, b_table);
+	}
+#endif
+
+	b_hdr->bm_nkeys = 1;
+	b_hdr->bm_next = NODE_ANCHOR;
+	ASSERT((u_int)b_hdr->bm_prev == NODE_ANCHOR);
+
+	b_hdr->bm_keys[0] = key;
+	b_table.bu_children[0] = will_be_lesser_bp->b_blkno;
+	b_table.bu_children[1] = greater_bp->b_blkno;
+
+	bdwrite(will_be_lesser_bp);
+	bdwrite(greater_bp);
+}
+
+int
+btree_shift_left(gbuf_t *right_bp, btree_lookup_args *cookie)
+{
+	gbuf_t *parent_bp;
+	gbuf_t *left_bp;
+	btree_meta *left_hdr, *right_hdr, *parent_hdr;
+	btree_data left_table, right_table, parent_table;
+	int space;
+	int level;
+	int index;
+	int non_leaf=1;
+
+	(void)parent_table;
+	INIT_BTREE_NODE(right_bp, right_hdr, right_table);
+	ASSERT(right_hdr->bm_magic == BTREE_HDR_MAGIC);
+
+	right_hdr->bm_lop = BPUSHLEFT;
+
+	level = right_hdr->bm_level + 1;
+
+	parent_bp = bread(cookie->bt_vp, cookie->bt_path[level].pt_bno);
+	INIT_BTREE_NODE(parent_bp, parent_hdr, parent_table);
+	ASSERT(parent_hdr->bm_magic == BTREE_HDR_MAGIC);
+
+	index = cookie->bt_path[level].pt_index;
+
+	if (index == 0) {
+
+		brelse(parent_bp);
+		return -1;	/* leaves do not have a common parent */
+	}
+
+	parent_hdr->bm_serialno = serialno;
+	parent_hdr->bm_lop = BLPIVOT;
+
+	/*
+	 * This index is relative to the children; it needs to be
+	 * translated into the key name space.
+	 *
+	 */
+	if (index) index--;
+
+	left_bp = bread(cookie->bt_vp, right_hdr->bm_prev);
+	VERIFY(left_bp);
+	INIT_BTREE_NODE(left_bp, left_hdr, left_table);
+	ASSERT(left_hdr->bm_magic == BTREE_HDR_MAGIC);
+	left_hdr->bm_lop = BPULLLEFT;
+	left_hdr->bm_serialno = serialno;
+
+	/*
+	 * We'll shift in half of the difference in key counts.
+	 *
+	 */
+	space = (right_hdr->bm_nkeys - left_hdr->bm_nkeys) >> 1;
+
+	if (right_hdr->bm_level == 0) non_leaf = 0;
+
+	/*
+	 * Move the keys over.
+	 *
+	 * i) demote the parent key to the left-most position on the left
+	 * ii) move the keys from the right
+	 * iii) promote the left-most key in the right node
+	 * iv) shift everything in the right node to the beginning
+	 *
+	 */
+
+	left_hdr->bm_keys[left_hdr->bm_nkeys] = parent_hdr->bm_keys[index];
+
+	xmove(right_hdr->bm_keys,
+	      left_hdr->bm_keys + left_hdr->bm_nkeys + non_leaf,
+	      (space - non_leaf) * sizeof(btree_key));
+
+	parent_hdr->bm_keys[index] = right_hdr->bm_keys[space - non_leaf];
+
+	xmove(right_hdr->bm_keys + space,
+	      right_hdr->bm_keys,
+	      (right_hdr->bm_nkeys - space) * sizeof(btree_key));
+
+	/*
+	 * Move the data.
+	 *
+	 * i) inter-node shift of data from right to left
+	 * ii) intra-node shift of the remaining data left
+	 *
+	 */
+
+	if (non_leaf) {
+
+		xmove(right_table.bu_children,
+		      left_table.bu_children + left_hdr->bm_nkeys + 1,
+		      space * sizeof(vbn_t));
+
+		xmove(right_table.bu_children + space,
+		      right_table.bu_children,
+		      (right_hdr->bm_nkeys + 1 - space) * sizeof(vbn_t));
+	} else {
+
+		xmove(right_table.bu_data,
+		      left_table.bu_data + left_hdr->bm_nkeys,
+		      space * sizeof(TREE_TYPE));
+
+		xmove(right_table.bu_data + space,
+		      right_table.bu_data,
+		      (right_hdr->bm_nkeys - space) * sizeof(TREE_TYPE));
+
+	}
+
+	left_hdr->bm_nkeys += space;
+	right_hdr->bm_nkeys -= space;
+
+	bdwrite(right_bp);
+	bdwrite(left_bp);
+	bdwrite(parent_bp);
+
+	return 0;
+}
+
+int
+btree_shift_right(gbuf_t *left_bp, btree_lookup_args *cookie)
+{
+	gbuf_t *parent_bp;
+	gbuf_t *right_bp;
+	btree_meta *left_hdr, *right_hdr, *parent_hdr;
+	btree_data left_table, right_table, parent_table;
+	int space;
+	int level;
+	int index;
+	int non_leaf=1;
+
+	(void)parent_table;
+	INIT_BTREE_NODE(left_bp, left_hdr, left_table);
+	ASSERT(left_hdr->bm_magic == BTREE_HDR_MAGIC);
+
+	left_hdr->bm_lop = BPUSHRIGHT;
+
+	level = left_hdr->bm_level + 1;
+
+	parent_bp = bread(cookie->bt_vp, cookie->bt_path[level].pt_bno);
+	INIT_BTREE_NODE(parent_bp, parent_hdr, parent_table);
+	ASSERT(parent_hdr->bm_magic == BTREE_HDR_MAGIC);
+
+	index = cookie->bt_path[level].pt_index;
+
+	if (index == parent_hdr->bm_nkeys) {
+
+		brelse(parent_bp);
+		return -1;	/* different parents */
+	}
+
+	parent_hdr->bm_serialno = serialno;
+	parent_hdr->bm_lop = BRPIVOT;
+
+	/*
+	 * This index is relative to the children; it needs to be
+	 * translated into the key name space.
+	 *
+	 */
+
+	right_bp = bread(cookie->bt_vp, left_hdr->bm_next);
+	INIT_BTREE_NODE(right_bp, right_hdr, right_table);
+	ASSERT(right_hdr->bm_magic == BTREE_HDR_MAGIC);
+	right_hdr->bm_lop = BPULLRIGHT;
+	right_hdr->bm_serialno = serialno;
+
+	space = (left_hdr->bm_nkeys - right_hdr->bm_nkeys) >> 1;
+
+	if (left_hdr->bm_level == 0) non_leaf = 0;
+
+	/*
+	 * Shift the keys over.
+	 *
+	 * i) intra-shift to make space for keys coming from the left
+	 * ii) demote the key from the parent
+	 * iii) inter-shift of keys from the left; the right node is now full
+	 * iv) promote a key from the left into the ancestors
+	 *
+	 */
+	xmove(right_hdr->bm_keys, right_hdr->bm_keys + space,
+	      right_hdr->bm_nkeys * sizeof(btree_key));
+
+	if (non_leaf)
+		right_hdr->bm_keys[space - 1] = parent_hdr->bm_keys[index];
+
+	xmove(left_hdr->bm_keys + left_hdr->bm_nkeys - space + non_leaf,
+	      right_hdr->bm_keys,
+	      (space - non_leaf) * sizeof(btree_key));
+
+	if (non_leaf) parent_hdr->bm_keys[index] =
+		left_hdr->bm_keys[left_hdr->bm_nkeys - space];
+	else parent_hdr->bm_keys[index] = right_hdr->bm_keys[0];
+
+	/*
+	 * Shift the children/payload over. The parent is not touched here.
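+	 * (the separator key in the parent was already updated above)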
+	 *
+	 * i) Make room in the right
+	 * ii) shift from the left
+	 *
+	 */
+
+	if (non_leaf) {
+
+		xmove(right_table.bu_children,
+		      right_table.bu_children + space,
+		      (right_hdr->bm_nkeys + 1) * sizeof(vbn_t));
+		xmove(left_table.bu_children + left_hdr->bm_nkeys + 1 - space,
+		      right_table.bu_children,
+		      space * sizeof(vbn_t));
+	} else {
+		xmove(right_table.bu_data,
+		      right_table.bu_data + space,
+		      right_hdr->bm_nkeys * sizeof(TREE_TYPE));
+		xmove(left_table.bu_data + left_hdr->bm_nkeys - space,
+		      right_table.bu_data,
+		      space * sizeof(TREE_TYPE));
+	}
+
+	right_hdr->bm_nkeys += space;
+	left_hdr->bm_nkeys -= space;
+
+	bdwrite(right_bp);
+	bdwrite(left_bp);
+	bdwrite(parent_bp);
+
+	return 0;
+}
+
+/*
+ * Crappy little binary search. All keys to the left of a key
+ * are less than it, that is, key[i] < key[i+1] for all i.
+ *
+ * If the key is not found then this returns where it would insert
+ * the key. This is IMPORTANT and callers rely on this
+ * property.
+ *
+ */
+
+int
+btree_intra_lookup(gbuf_t *bp, btree_key key)
+{
+	btree_meta *b_hdr;
+	btree_data b_table;
+	btree_key *table;
+	int right, left;
+	int cursor;
+
+	(void)b_table;
+	INIT_BTREE_NODE(bp, b_hdr, b_table);
+
+	table = b_hdr->bm_keys;
+
+	right = b_hdr->bm_nkeys - 1;
+	left = 0;
+	while (1) {
+
+		if (left > right) {
+
+			if (right >= 0 && table[right] > key)
+				cursor = right;
+			else
+				cursor = left;
+
+			break;	/* while(1) search loop */
+		}
+
+		cursor = (right + left) >> 1;
+
+		if (key == table[cursor]) {
+
+			if (b_hdr->bm_level) cursor++;
+
+			break;	/* while(1) search loop */
+		}
+
+		if (key < table[cursor])
+			right = cursor - 1;
+		else
+			left = cursor + 1;
+	}
+
+	ASSERT(cursor >= 0 && cursor <= b_hdr->bm_nkeys);
+	ASSERT((cursor == b_hdr->bm_nkeys) || (table[cursor] >= key));
+
+	return cursor;
+}
+
+int btree_create_btree(char *path, gfile_t **btree)
+{
+	gfile_t *vp;
+	btree_meta *b_hdr;
+	btree_data b_table;
+	gbuf_t *bp;
+	vbn_t blkno;
+
+	(void)b_table;
+	unlink (path);
+
+	vp = util_load_vp (path);
+	if (vp == NULL)
+		return -errno;
+
+	blkno = btree_grow (vp);
+	assert (blkno == BTREE_ROOT_FBN);
+
+	TREE_LOCK(vp);
+	*btree = vp;
+
+	bp = bread (*btree, BTREE_ROOT_FBN);
+	memset (bp->b_data, 0, TREE_BSIZE);
+	INIT_BTREE_NODE(bp, b_hdr, b_table);
+
+	b_hdr->bm_next = b_hdr->bm_prev = NODE_ANCHOR;
+	b_hdr->bm_magic = BTREE_HDR_MAGIC;
+
+	bdwrite(bp);
+
+	TREE_UNLOCK(vp);
+
+	return 0;
+}
+
+#if 0
+static int population;
+
+void
+btree_paranoid(gfile_t *vp, vbn_t fbn, btree_key upper_bound)
+{
+	extern int insertions, deletions;
+	extern btree_key keys[];
+	btree_key last_key, cursor;
+	gbuf_t *bp;
+	btree_meta *b_hdr;
+	btree_data b_table;
+	int i;
+
+	if (fbn == BTREE_ROOT_FBN) {
+
+		population = 0;
+	}
+
+	bp = bread(vp, fbn);
+	INIT_BTREE_NODE(bp, b_hdr, b_table);
+	ASSERT(b_hdr->bm_magic == BTREE_HDR_MAGIC);
+	ASSERT(b_hdr->bm_fbn == fbn);
+
+	ASSERT(b_hdr->bm_keys[0] < upper_bound);
+	ASSERT(b_hdr->bm_keys[b_hdr->bm_nkeys - 1] < upper_bound);
+
+	if (b_hdr->bm_level == 0)
+		population += b_hdr->bm_nkeys;
+
+	for (i = 1; i < b_hdr->bm_nkeys - 1; i++) {
+
+		ASSERT(b_hdr->bm_keys[i-1] < b_hdr->bm_keys[i]);
+	}
+
+	for (i=0; i <= b_hdr->bm_nkeys; i++) {
+
+		if (b_hdr->bm_level == 0) {
+
+			if (i == b_hdr->bm_nkeys) continue;
+
+			ASSERT(b_hdr->bm_keys[i] == b_table.bu_data[i].key);
+			ASSERT(keys[b_hdr->bm_keys[i]] == b_hdr->bm_keys[i]);
+			ASSERT(atoi(b_table.bu_data[i].name) ==
+				b_hdr->bm_keys[i]);
+
+			if (i == 0) last_key = b_hdr->bm_keys[0];
+			else {
+
+				for (cursor = last_key + 1;
+				     cursor < b_hdr->bm_keys[i];
+				     cursor++) {
+					if (keys[cursor] != 0LL)
+						printf("\n*\t%d\n",
+							(int) keys[cursor]);
+				}
+				last_key = b_hdr->bm_keys[i];
+			}
+		} else
+			btree_paranoid(vp, b_table.bu_children[i],
+				(i == b_hdr->bm_nkeys ?
+					BTREE_KEY_MAX :
+					b_hdr->bm_keys[i]));
+	}
+
+	brelse(bp);
+
+	if (fbn == BTREE_ROOT_FBN)
+		ASSERT(population == insertions - deletions);
+}
+#endif
+
+#ifdef PARANOID
+btree_key dkey=0LL;
+int32_t dcursor=-1;
+vbn_t fbn=0;
+uint32_t dserialno;
+#endif
+
+void
+btree_delete_modify(btree_lookup_args *cookie, int level)
+{
+	gbuf_t *bp;
+	btree_meta *b_hdr;
+	btree_data b_table;
+#if 0
+	gbuf_t *right_bp=0, *left_bp=0;
+	btree_meta *right_hdr=0;
+	btree_meta *left_hdr=0;
+	TREE_TYPE *right_table;
+	TREE_TYPE *left_table;
+	int rc;
+#endif
+	int cursor;
+
+	bp = bread(cookie->bt_vp, cookie->bt_path[level].pt_bno);
+	INIT_BTREE_NODE(bp, b_hdr, b_table);
+	ASSERT(b_hdr->bm_magic == BTREE_HDR_MAGIC);
+
+	cursor = cookie->bt_path[level].pt_index;
+
+#ifdef PARANOID
+	ASSERT(level == 0);
+	ASSERT(b_hdr->bm_keys[cursor] == cookie->bt_key);
+	ASSERT(b_table[cursor] == (TREE_TYPE) cookie->bt_key);
+
+	dkey = cookie->bt_key;
+	dcursor = cursor;
+	fbn = bp->b_blkno;
+	dserialno = serialno;
+#endif
+
+	if (level > 0) {
+
+		if (cursor > 0)
+			xmove(b_hdr->bm_keys + cursor,
+			      b_hdr->bm_keys + cursor - 1,
+			      (b_hdr->bm_nkeys - cursor) * sizeof(btree_key));
+		else
+			xmove(b_hdr->bm_keys + 1,
+			      b_hdr->bm_keys,
+			      b_hdr->bm_nkeys * sizeof(btree_key));
+
+		xmove(b_table.bu_children + cursor + 1,
+		      b_table.bu_children + cursor,
+		      (b_hdr->bm_nkeys - cursor) * sizeof(vbn_t));
+	} else if (cursor < b_hdr->bm_nkeys - 1) {
+
+		xmove(b_hdr->bm_keys + cursor + 1, b_hdr->bm_keys + cursor,
+		      (b_hdr->bm_nkeys - cursor) * sizeof(btree_key));
+
+		xmove(b_table.bu_data + cursor + 1,
+		      b_table.bu_data + cursor,
+		      (b_hdr->bm_nkeys - cursor) * sizeof(TREE_TYPE));
+	}
+
+	b_hdr->bm_nkeys--;
+	b_hdr->bm_serialno = serialno;
+	b_hdr->bm_lop = BDELETE;
+
+	bdwrite(bp);
+
+	return;
+
+#if 0
+	/*
+	 * If we are the root of the tree then there is nothing
+	 * that we can do, so bail early.
+	 *
+	 */
+	if (bp->b_blkno == BTREE_ROOT_FBN)
+		return;
+
+	/*
+	 * Determine if an underflow has occurred.
+	 *
+	 */
+	if (b_hdr->bm_nkeys > (BTREE_N >> 1))
+		return;
+
+	/*
+	 * An underflow will occur. We now consult our neighbours and
+	 * choose one of the following actions. Except when we are the
+	 * last remaining node, one of the following is guaranteed to be
+	 * possible.
+	 *
+	 * 1) Shift in to a neighbour
+	 *
+	 * 2) Accept keys from a neighbour
+	 *
+	 */
+	if (b_hdr->bm_next != NODE_ANCHOR) {
+
+		right_bp = bread(cookie->bt_vp, b_hdr->bm_next);
+		INIT_BTREE_NODE(right_bp, right_hdr, right_table);
+		ASSERT(right_hdr->bm_magic == BTREE_HDR_MAGIC);
+
+		if (right_hdr->bm_nkeys + b_hdr->bm_nkeys < BTREE_N) {
+
+			/*
+			 * We have room in our neighbour so we will shift and
+			 * delete the node.
+			 *
+			 */
+			brelse(right_bp);
+			rc = btree_shift_right(bp, cookie);
+			if (rc == 0) {
+
+				btree_delete_modify(cookie, level + 1);
+				btree_collapse_node(bp);
+				return;
+			}
+
+		} else brelse(right_bp);
+	}
+
+	if (b_hdr->bm_prev != NODE_ANCHOR) {
+
+		left_bp = bread(cookie->bt_vp, b_hdr->bm_prev);
+		INIT_BTREE_NODE(left_bp, left_hdr, left_table);
+		ASSERT(left_hdr->bm_magic == BTREE_HDR_MAGIC);
+
+		/*
+		 * We have room in our neighbour so we will shift and
+		 * delete.
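+		 * (this rebalancing path is compiled out; a delete
+		 * currently just removes the key and may leave the
+		 * node underfull)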
+ * + */ + if (left_hdr->bm_nkeys + b_hdr->bm_nkeys < BTREE_N) { + + brelse(left_bp); + rc = btree_shift_left(bp, cookie); + if (rc == 0) { + + btree_delete_modify(cookie, level + 1); + btree_collapse_node(bp); + return; + } + + } else brelse(left_bp); + } +#endif /* if 0 */ +} + +#if 0 +void +btree_collapse_node(gbuf_t *bp) +{ + extern int nleaves; + gbuf_t *right_bp=0, *left_bp=0; + btree_meta *b_hdr; + btree_meta *right_hdr=0; + btree_meta *left_hdr=0; + TREE_TYPE *b_table; + TREE_TYPE *right_table; + TREE_TYPE *left_table; + + INIT_BTREE_NODE(bp, b_hdr, b_table); + ASSERT(b_hdr->bm_magic == BTREE_HDR_MAGIC); + + if (b_hdr->bm_next != NODE_ANCHOR) { + + right_bp = bread(bp->b_vp, b_hdr->bm_next); + INIT_BTREE_NODE(right_bp, right_hdr, right_table); + ASSERT(right_hdr->bm_magic == BTREE_HDR_MAGIC); + + right_hdr->bm_prev = b_hdr->bm_prev; + bdwrite(right_bp); + } + + if (b_hdr->bm_prev != NODE_ANCHOR) { + + left_bp = bread(bp->b_vp, b_hdr->bm_prev); + INIT_BTREE_NODE(left_bp, left_hdr, left_table); + ASSERT(left_hdr->bm_magic == BTREE_HDR_MAGIC); + + left_hdr->bm_next = b_hdr->bm_next; + bdwrite(left_bp); + } + + nleaves--; + spensa_punch_hole(bp->b_vp, (vbn_t) bp->b_blkno); +} +#endif + +vbn_t btree_grow(gfile_t *vp) +{ + vbn_t blkno; + + blkno = alloc_block (vp); + + return blkno; +} + +#ifdef BTREE_OVERWRITE +int btree_overwrite(btree_lookup_args *cookie) +{ + btree_meta *b_hdr; + btree_data b_table; + + INIT_BTREE_NODE(cookie->bt_bp, b_hdr, b_table); + ASSERT(b_hdr->bm_magic == BTREE_HDR_MAGIC); + + b_hdr->bm_keys[cookie->bt_index] = cookie->bt_key; + b_table.bu_data[cookie->bt_index] = cookie->bt_data; + + bdwrite(cookie->bt_bp); + + return 0; +} +#endif + +#ifdef BTREE_ITER + +static int tblkno; +static int tcurrent_index; + +void btree_traverse_reset (gfile_t *vp) +{ + btree_meta *b_hdr; + btree_data b_table; + gbuf_t *bp; + + tblkno = BTREE_ROOT_FBN; + + while (1) { + + bp = bread (vp, tblkno); + INIT_BTREE_NODE(bp, b_hdr, b_table); + ASSERT(b_hdr->bm_magic == BTREE_HDR_MAGIC); + ASSERT(b_hdr->bm_fbn == tblkno); + + if (b_hdr->bm_level == 0) break; + + tblkno = b_table.bu_children[0]; + brelse (bp); + } + + brelse (bp); + tcurrent_index = 0; + + return; +} + +TREE_TYPE *btree_traverse_next (gfile_t *vp) +{ + btree_data b_table; + btree_meta *b_hdr; + TREE_TYPE *datap; + gbuf_t *bp; + int try_again; + +retry: + try_again=0; + + if (tblkno == NODE_ANCHOR) return 0; + + bp = bread (vp, tblkno); + INIT_BTREE_NODE(bp, b_hdr, b_table); + ASSERT(b_hdr->bm_magic == BTREE_HDR_MAGIC); + ASSERT(b_hdr->bm_fbn == tblkno); + + if (b_hdr->bm_nkeys == 0) { + + try_again = 1; + goto empty; + } + + datap = &b_table.bu_data[tcurrent_index]; + + tcurrent_index++; + if (tcurrent_index == b_hdr->bm_nkeys) { + +empty: + tblkno = b_hdr->bm_next; + tcurrent_index = 0; + } + + brelse (bp); + + if (try_again) + goto retry; + + return datap; +} + +#endif /* BTREE_ITER */ + +int +btree_lookup (gfile_t *vp, btree_key key, TREE_TYPE *datum) +{ + btree_lookup_args cookie; + int rc; + + cookie.bt_vp = vp; + cookie.bt_key = key; + cookie.bt_intent = BTREE_LFLAG_LOOKUP | BTREE_LFLAG_FAST; + + rc = btree_entry (&cookie); + if (rc == BTREE_RC_NOTFOUND) + return ENOENT; + + *datum = *cookie.bt_datap; + return 0; +} + +int +btree_insert (gfile_t *vp, btree_key key, TREE_TYPE datum) +{ + btree_lookup_args cookie; + int rc; + + cookie.bt_vp = vp; + cookie.bt_key = key; + cookie.bt_data = datum; + cookie.bt_intent = BTREE_LFLAG_INSERT | BTREE_LFLAG_UPDATE; + + rc = btree_entry (&cookie); + if (rc == BTREE_RC_FOUND) + 
+		return EEXIST;
+
+	if (rc == BTREE_RC_INSERTED || rc == BTREE_RC_DONE)
+		return 0;
+
+	assert (0);
+
+	return 0;
+}
diff --git a/apps/phttpd/bplus_support.c b/apps/phttpd/bplus_support.c
new file mode 100644
index 000000000..04736bf61
--- /dev/null
+++ b/apps/phttpd/bplus_support.c
@@ -0,0 +1,198 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <bplus_support.h>
+#include <bplus_common.h>
+
+#include <x86intrin.h>
+#ifdef WITH_CLFLUSHOPT
+#define _mm_clflush(p) _mm_clflushopt(p)
+#endif
+
+#define MAX_BUFFERS 64
+gbuf_t buffer_table[MAX_BUFFERS];
+
+gfile_t *
+util_load_vp (const char *path)
+{
+	gfile_t *vp;
+	int error;
+	struct stat stx;
+	int rc;
+	int fd;
+
+	fd = open (path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+	if (fd < 0)
+		return NULL;
+
+	rc = fstat (fd, &stx);
+	if (rc)
+	{
+		close (fd);
+		return NULL;
+	}
+
+	vp = (gfile_t *) malloc (sizeof (gfile_t));
+	memset (vp, 0, sizeof (gfile_t));
+	vp->v_fd = fd;
+	vp->v_used = vp->v_size = stx.st_size;
+
+	if (stx.st_size == 0)
+	{
+		/*
+		 * We assume we're creating a new B+ tree.
+		 *
+		 */
+
+		rc = ftruncate (vp->v_fd, (off_t) TREE_GROW_SIZE);
+		if (rc)
+		{
+			close (vp->v_fd);
+			free (vp);
+			return NULL;
+		}
+
+		rc = fstat (vp->v_fd, &stx);
+		if (rc)
+		{
+			close (vp->v_fd);
+			free (vp);
+			return NULL;
+		}
+
+		vp->v_size = stx.st_size;
+		vp->v_used = 0ll;
+	}
+
+	error = map (vp);
+	if (error)
+	{
+		close (vp->v_fd);
+		free (vp);
+		return NULL;
+	}
+
+	return vp;
+}
+
+void
+util_unload_vp (gfile_t *vp)
+{
+	if (ftruncate (vp->v_fd, vp->v_used) < 0)
+		perror("ftruncate");
+
+	munmap (vp->v_base, vp->v_size);
+	close (vp->v_fd);
+	free (vp);
+}
+
+int
+map (gfile_t *vp)
+{
+	vp->v_base = (caddr_t) mmap (
+			vp->v_base,
+			vp->v_size,
+			PROT_READ | PROT_WRITE,
+			MAP_SHARED | MAP_FILE,
+			vp->v_fd,
+			0);
+
+	if (vp->v_base == (caddr_t) -1)
+		return errno;
+
+	return 0;
+}
+
+vbn_t
+alloc_block (gfile_t *vp)
+{
+	vbn_t blkno = 0;
+	int error;
+	int rc;
+	int i = 0;
+
+	blkno = TREE_O2B (vp->v_used);
+	vp->v_used += TREE_BSIZE;
+
+	if (vp->v_used < vp->v_size)
+		return blkno;
+
+	caddr_t old_base = vp->v_base;
+	munmap (vp->v_base, vp->v_size);
+	vp->v_size <<= 1;
+
+	rc = ftruncate (vp->v_fd, vp->v_size);
+	if (rc)
+		return -errno;
+
+	error = map (vp);
+	if (error)
+		return -errno;
+
+	if (old_base == NULL || old_base == vp->v_base)
+		return blkno;
+
+	/*
+	 * The OS has changed our mapping. Move the buffer pointers.
+	 *
+	 */
+	for (i = 0; i < MAX_BUFFERS; ++i)
+		if (buffer_table[i].b_flags & B_VALID)
+			buffer_table[i].b_data = vp->v_base +
+				TREE_B2O (buffer_table[i].b_blkno);
+
+	return blkno;
+}
+
+gbuf_t *
+bread (gfile_t *vp, vbn_t blkno)
+{
+	gbuf_t *bp;
+	size_t off;
+
+	bp = &buffer_table[vp->v_bufIDX & (MAX_BUFFERS - 1)];
+	assert ((bp->b_flags & B_INUSE) == 0);
+	++vp->v_bufIDX;
+
+	off = TREE_B2O (blkno);
+
+	if (off >= vp->v_size)
+		return NULL;
+
+	bp->b_vp = vp;
+	bp->b_blkno = blkno;
+	bp->b_flags = B_INUSE | B_VALID;
+	bp->b_data = vp->v_base + off;
+
+	return bp;
+}
+
+void
+brelse (gbuf_t *bp)
+{
+	bp->b_flags &= ~B_INUSE;
+}
+
+void
+bdwrite (gbuf_t *bp)
+{
+	/*
+	 * Nothing to do except release the buffer. bp->b_data points at
+	 * mmapped pages, and we assume a DIO file system for NVM.
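+	 * The clflush loop below pushes the dirtied block out of the CPU
+	 * cache so that the stores reach NVM before the buffer is released.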
+	 *
+	 */
+
+	int i;
+	for (i = 0; i < 4096; i+=64) {
+		_mm_clflush(bp->b_data+i);
+	}
+	brelse (bp);
+}
diff --git a/apps/phttpd/bplus_support.h b/apps/phttpd/bplus_support.h
new file mode 100644
index 000000000..e88441f29
--- /dev/null
+++ b/apps/phttpd/bplus_support.h
@@ -0,0 +1,47 @@
+#ifndef __PASTE_BPLUSSUPPORT__H__
+#define __PASTE_BPLUSSUPPORT__H__
+
+#include <sys/types.h>
+#include <stdint.h>
+
+#define TREE_BSIZE	4096
+#define TREE_BSHIFT	12
+#define TREE_O2B(X)	((X) ? ((X) >> 12) : 0)
+#define TREE_B2O(X)	((X) << 12)
+#define TREE_GROW_SIZE	(64 * TREE_BSIZE)
+
+typedef int32_t vbn_t;
+
+typedef struct {
+
+	int	v_fd;
+	size_t	v_size;
+	size_t	v_used;
+	caddr_t	v_base;
+	int	v_bufIDX;
+
+} gfile_t;
+
+#define B_VALID	0x01
+#define B_INUSE	0x02
+#define B_DIRTY	0x04
+#define B_ERROR	0x08
+
+typedef struct {
+
+	gfile_t	*b_vp;
+	vbn_t	b_blkno;
+	int	b_flags;
+	caddr_t	b_data;
+
+} gbuf_t;
+
+gfile_t *util_load_vp (const char *path);
+void util_unload_vp (gfile_t *);
+int map (gfile_t *);
+vbn_t alloc_block (gfile_t *);
+gbuf_t *bread (gfile_t *, vbn_t);
+void brelse (gbuf_t *);
+void bdwrite (gbuf_t *);
+
+#endif
diff --git a/apps/phttpd/nmlib.h b/apps/phttpd/nmlib.h
new file mode 100644
index 000000000..5c7d4caa7
--- /dev/null
+++ b/apps/phttpd/nmlib.h
@@ -0,0 +1,1412 @@
+#ifndef _NMLIB_H_
+#define _NMLIB_H_
+#include
+#ifdef __FreeBSD__
+#include
+#include	/* pthread w/ affinity */
+#include	/* sysctl */
+#endif
+#include
+#include
+#include
+#include
+#include
+#include
+#include	/* SOL_TCP */
+#include
+#ifdef __linux__
+#include	/* sysctl */
+#include	/* SOL_TCP */
+#include
+#include
+#endif /* __linux__ */
+
+#ifdef __cplusplus
+extern "C" {
+#include
+}
+#else
+#include
+#endif /* __cplusplus */
+
+#ifndef D
+#define D(fmt, ...) \
+	printf(""fmt"\n", ##__VA_ARGS__)
+#endif
+
+int normalize = 1;
+
+#define EPOLLEVENTS 2048
+#define DEBUG_SOCKET 1
+#ifndef linux
+#define SOL_TCP SOL_SOCKET
+#define fallocate(a, b, c, d) posix_fallocate(a, c, d)
+#endif
+
+enum dev_type { DEV_NONE, DEV_NETMAP, DEV_SOCKET };
+enum { TD_TYPE_SENDER = 1, TD_TYPE_RECEIVER, TD_TYPE_OTHER, TD_TYPE_DUMMY };
+
+#ifdef linux
+#define cpuset_t cpu_set_t
+#endif
+/* set the thread affinity */
+static inline int
+setaffinity(pthread_t me, int i)
+{
+	cpuset_t cpumask;
+
+	if (i == -1)
+		return 0;
+
+	/* Set the thread affinity. */
+	CPU_ZERO(&cpumask);
+	CPU_SET(i, &cpumask);
+
+	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
+		D("Unable to set affinity: %s", strerror(errno));
+		return 1;
+	}
+	return 0;
+}
+
+static void
+tx_output(struct my_ctrs *cur, double delta, const char *msg)
+{
+	double bw, raw_bw, pps, abs;
+	char b1[40], b2[80], b3[80];
+	u_int size;
+
+	if (cur->pkts == 0) {
+		printf("%s nothing.\n", msg);
+		return;
+	}
+
+	size = (cur->bytes / cur->pkts);
+
+	printf("%s %llu packets %llu bytes %llu events %d bytes each in %.2f seconds.\n",
+		msg,
+		(unsigned long long)cur->pkts,
+		(unsigned long long)cur->bytes,
+		(unsigned long long)cur->events, size, delta);
+	if (delta == 0)
+		delta = 1e-6;
+	if (size < 60)		/* correct for min packet size */
+		size = 60;
+	pps = cur->pkts / delta;
+	bw = (8.0 * cur->bytes) / delta;
+	/* raw packets have 4 bytes CRC + 20 bytes framing */
+	raw_bw = (8.0 * (cur->pkts * 24 + cur->bytes)) / delta;
+	abs = cur->pkts / (double)(cur->events);
+
+	printf("Speed: %spps Bandwidth: %sbps (raw %sbps). Average batch: %.2f pkts\n",
Average batch: %.2f pkts\n", + norm(b1, pps, normalize), norm(b2, bw, normalize), norm(b3, raw_bw, normalize), abs); +} + +struct nm_msg { + struct netmap_ring *rxring; + struct netmap_ring *txring; + struct netmap_slot *slot; + struct nm_targ *targ; + int fd; +}; + +struct nm_garg { + char ifname[NETMAP_REQ_IFNAMSIZ*2]; // must be here + struct nmport_d *nmd; + void *(*td_body)(void *); + int nthreads; + int affinity; + int dev_type; + int td_type; + int main_fd; + int system_cpus; + int cpus; + uint32_t extra_bufs; /* goes in nr_arg3 */ + uint64_t extmem_siz; + u_int ring_objsize; + int extra_pipes; /* goes in nr_arg1 */ + char *nmr_config; + char *extmem; /* goes to nr_arg1+ */ +#define STATS_WIN 15 + int win_idx; + int64_t win[STATS_WIN]; + int wait_link; + int polltimeo; + int pollevents; +#ifdef __FreeBSD__ + struct timespec *polltimeo_ts; +#endif + int verbose; + int report_interval; +#define OPT_PPS_STATS 2048 + int options; + int targ_opaque_len; // passed down to targ + + struct nmreq_header nm_hdr; // cache decoded + int (*data)(struct nm_msg *); + void (*connection)(struct nm_msg *); + int (*read)(struct nm_msg *); + int (*thread)(struct nm_targ *); + int (*writable)(struct nm_msg *); + int *fds; + u_int fdnum; + int *cfds; + u_int cfdnum; + int emu_delay; + void *garg_private; + char ifname2[NETMAP_REQ_IFNAMSIZ]; +}; + +struct nm_targ { + struct nm_garg *g; + struct nmport_d *nmd; + /* these ought to be volatile, but they are + * only sampled and errors should not accumulate + */ + struct my_ctrs ctr; + + struct timespec tic, toc; + int used; + int completed; + int cancel; + int fd; + int me; + int affinity; + pthread_t thread; +#ifdef NMLIB_EXTRA_SLOT + struct netmap_slot *extra; +#else + uint32_t *extra; +#endif + uint32_t extra_cur; + uint32_t extra_num; + int *fdtable; + int fdtable_siz; +#ifdef linux + struct epoll_event evts[EPOLLEVENTS]; +#else + struct kevent evts[EPOLLEVENTS]; +#endif /* linux */ + void *opaque; +}; + +static inline void +nm_update_ctr(struct nm_targ *targ, int npkts, int nbytes) +{ + targ->ctr.pkts += npkts; + targ->ctr.bytes += nbytes; +} + +static struct nm_targ *targs; +static int global_nthreads; + +/* control-C handler */ +static void +sigint_h(int sig) +{ + int i; + + (void)sig; /* UNUSED */ + D("received control-C on thread %p", (void *)pthread_self()); + for (i = 0; i < global_nthreads; i++) { + D("canceling targs[i] %p", &targs[i]); + targs[i].cancel = 1; + } +} + + +/* sysctl wrapper to return the number of active CPUs */ +static int +system_ncpus(void) +{ + int ncpus; +#if defined (__FreeBSD__) + int mib[2] = { CTL_HW, HW_NCPU }; + size_t len = sizeof(mib); + sysctl(mib, 2, &ncpus, &len, NULL, 0); +#elif defined(linux) + ncpus = sysconf(_SC_NPROCESSORS_ONLN); +#elif defined(_WIN32) + { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + ncpus = sysinfo.dwNumberOfProcessors; + } +#else /* others */ + ncpus = 1; +#endif /* others */ + return (ncpus); +} + +static void * +nm_thread(void *data) +{ + struct nm_targ *targ = (struct nm_targ *) data; + struct nm_garg *g = targ->g; + + D("start, fd %d main_fd %d affinity %d", + targ->fd, targ->g->main_fd, targ->affinity); + if (setaffinity(targ->thread, targ->affinity)) + goto quit; + g->td_body(data); + +quit: + targ->used = 0; + return (NULL); +} + +static int +nm_start_threads(struct nm_garg *g) +{ + int i; + struct nm_targ *t; + + targs = (struct nm_targ *)calloc(g->nthreads, sizeof(*targs)); + if (!targs) { + return -ENOMEM; + } + for (i = 0; i < g->nthreads; i++) { + t = &targs[i]; + + 
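+		/* start from a clean slate; fd stays -1 until a port is opened */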
bzero(t, sizeof(*t)); + t->fd = -1; + t->g = g; + t->opaque = calloc(1, g->targ_opaque_len); + if (t->opaque == NULL) { + continue; + } + +#ifndef NOLIBNETMAP + if (g->dev_type == DEV_NETMAP) { + t->nmd = nmport_clone(g->nmd); + if (i > 0) { + /* register one NIC only */ + char name[NETMAP_REQ_IFNAMSIZ], *suff; + size_t nl = strlen(t->nmd->hdr.nr_name); + + if (asprintf(&suff, "-%d", i) < 0) { + perror("asprintf"); + continue; + } + if (sizeof(name) < nl + strlen(suff) + 1) { + D("no space %s", t->nmd->hdr.nr_name); + continue; + } + /* let nmport_parse() handle errors */ + strlcpy((char *)mempcpy(name, t->nmd->hdr.nr_name, nl), + suff, sizeof(name) - nl); + free(suff); + if (nmport_parse(t->nmd, name)) { + D("failed in nmport_parse %s", name); + continue; + } + if (nmport_open_desc(t->nmd)) { + D("Unable to open %s: %s", t->g->ifname, + strerror(errno)); + continue; + } + D("thread %d %u extra bufs at %u", i, + t->nmd->reg.nr_extra_bufs, + t->nmd->nifp->ni_bufs_head); + } else { + t->nmd = g->nmd; + } + t->fd = t->nmd->fd; + + } +#endif /* NOLIBNETMAP */ + t->used = 1; + t->me = i; + if (g->affinity >= 0) { + t->affinity = (g->affinity + i) % g->system_cpus; + } else { + t->affinity = -1; + } + } + /* Wait for PHY reset. */ + D("Wait %d secs for phy reset", g->wait_link); + sleep(g->wait_link); + D("Ready..."); + + D("nthreads %d", g->nthreads); + for (i = 0; i < g->nthreads; i++) { + t = &targs[i]; + if (pthread_create(&t->thread, NULL, &nm_thread, t) == -1) { + D("Unable to create thread %d: %s", i, strerror(errno)); + t->used = 0; + } + } + return 0; +} + +static void +nm_main_thread(struct nm_garg *g) +{ + int i; + + struct my_ctrs prev, cur; + double delta_t; + struct timeval tic, toc; + + prev.pkts = prev.bytes = prev.events = 0; + gettimeofday(&prev.t, NULL); + for (;;) { + char b1[40], b2[40], b3[40], b4[100]; + uint64_t pps, usec; + struct my_ctrs x; + double abs; + int done = 0; + + usec = wait_for_next_report(&prev.t, &cur.t, + g->report_interval); + + cur.pkts = cur.bytes = cur.events = 0; + cur.min_space = 0; + if (usec < 10000) /* too short to be meaningful */ + continue; + /* accumulate counts for all threads */ + for (i = 0; i < g->nthreads; i++) { + cur.pkts += targs[i].ctr.pkts; + cur.bytes += targs[i].ctr.bytes; + cur.events += targs[i].ctr.events; + cur.min_space += targs[i].ctr.min_space; + targs[i].ctr.min_space = 99999; + if (targs[i].used == 0) { + done++; + } + } + x.pkts = cur.pkts - prev.pkts; + x.bytes = cur.bytes - prev.bytes; + x.events = cur.events - prev.events; + pps = (x.pkts*1000000 + usec/2) / usec; + abs = (x.events > 0) ? (x.pkts / (double) x.events) : 0; + + if (!(g->options & OPT_PPS_STATS)) { + strcpy(b4, ""); + } else { + /* Compute some pps stats using a sliding window. 
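+			 * The average and the standard deviation are taken
+			 * over the non-zero samples in win[], so the first
+			 * STATS_WIN reports are based on fewer samples.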
*/ + double ppsavg = 0.0, ppsdev = 0.0; + int nsamples = 0; + + g->win[g->win_idx] = pps; + g->win_idx = (g->win_idx + 1) % STATS_WIN; + + for (i = 0; i < STATS_WIN; i++) { + ppsavg += g->win[i]; + if (g->win[i]) { + nsamples ++; + } + } + ppsavg /= nsamples; + + for (i = 0; i < STATS_WIN; i++) { + if (g->win[i] == 0) { + continue; + } + ppsdev += (g->win[i] - ppsavg) * (g->win[i] - ppsavg); + } + ppsdev /= nsamples; + ppsdev = sqrt(ppsdev); + + snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]", + norm(b1, ppsavg, normalize), norm(b2, ppsdev, normalize)); + } + + D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space", + norm(b1, pps, normalize), b4, + norm(b2, (double)x.pkts, normalize), + norm(b3, (double)x.bytes*8, normalize), + (unsigned long long)usec, + abs, (int)cur.min_space); + prev = cur; + + if (done == g->nthreads) + break; + } + + timerclear(&tic); + timerclear(&toc); + cur.pkts = cur.bytes = cur.events = 0; + /* final round */ + for (i = 0; i < g->nthreads; i++) { + struct timespec t_tic, t_toc; + /* + * Join active threads, unregister interfaces and close + * file descriptors. + */ + if (targs[i].used) + pthread_join(targs[i].thread, NULL); /* blocking */ + if (g->dev_type == DEV_NETMAP) { +#ifndef NOLIBNETMAP + nmport_close(targs[i].nmd); +#endif /* !NOLIBNETMAP */ + targs[i].nmd = NULL; + } else if (targs[i].fd > 2) { + close(targs[i].fd); + } + if (targs[i].completed == 0) + D("ouch, thread %d exited with error", i); + /* + * Collect threads output and extract information about + * how long it took to send all the packets. + */ + cur.pkts += targs[i].ctr.pkts; + cur.bytes += targs[i].ctr.bytes; + cur.events += targs[i].ctr.events; + /* collect the largest start (tic) and end (toc) times, + * XXX maybe we should do the earliest tic, or do a weighted + * average ? + */ + t_tic = timeval2spec(&tic); + t_toc = timeval2spec(&toc); + if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic)) + tic = timespec2val(&targs[i].tic); + if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc)) + toc = timespec2val(&targs[i].toc); + + } + /* print output. 
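+	 * The counters summed over all threads are reported against
+	 * the widest tic-toc interval collected above.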
*/ + timersub(&toc, &tic, &toc); + delta_t = toc.tv_sec + 1e-6* toc.tv_usec; + if (g->td_type == TD_TYPE_SENDER) + tx_output(&cur, delta_t, "Sent"); + else if (g->td_type == TD_TYPE_RECEIVER) + tx_output(&cur, delta_t, "Received"); +} + +#define IF_OBJTOTAL 100 +#define RING_OBJSIZE 33024 +#define RING_OBJTOTAL (IF_OBJTOTAL * 4) + +static int +nm_start(struct nm_garg *g) +{ + int i, devqueues = 0; + struct sigaction sa; + sigset_t ss; +#ifndef NOLIBNETMAP + int error; +#endif + + g->main_fd = -1; + g->wait_link = 3; + g->report_interval = 2000; + g->cpus = g->system_cpus = i = system_ncpus(); + if (g->nthreads == 0) + g->nthreads = 1; + if (g->cpus < 0 || g->cpus > i) { + D("%d cpus is too high, have only %d cpus", g->cpus, i); + return -EINVAL; + } + D("running on %d cpus (have %d)", g->cpus, i); + if (g->cpus == 0) + g->cpus = i; + + if (g->dev_type != DEV_NETMAP) + goto nonetmap; + +#ifndef NOLIBNETMAP + if (g->nthreads > 1) { + /* register only one ring */ + if (strlen(g->ifname) + 2 > sizeof(g->ifname) - 1) { + D("no space in g->ifname"); + return -EINVAL; + } + strlcat(g->ifname, "-0", sizeof(g->ifname)); + } + + if (g->nthreads > 1) { + char conf[32]; + /* create multiple rings */ + snprintf(conf, sizeof(conf), "@conf:rings=%d", g->nthreads); + strlcat(g->ifname, conf, sizeof(g->ifname)); + } + if (g->extmem) { + int i; + size_t need_rings, need_rings_space, need_ifs, need_ifs_space, + buf_space, need_rings_bufs, buf_avail; + char extm[128], kv[32]; + char *prms[4] = {(char *)",if-num=%u", + (char *)",ring-num=%u", (char *)",ring-size=%u", + (char *)",buf-num=%u"}; + u_int32_t prmvals[4]; + + //= {IF_OBJTOTAL, RING_OBJTOTAL, + // RING_OBJSIZE, (uint32_t)g->extra_bufs + 320000}; + need_rings = 12 * g->nthreads; + need_rings_space = (g->ring_objsize+64) * need_rings; + need_rings_bufs = (g->ring_objsize+64)/sizeof(struct netmap_slot); + need_ifs = g->nthreads * 2; + need_ifs_space = (sizeof(struct netmap_if)+64) * need_ifs; + buf_space = g->extmem_siz - need_rings_space - need_ifs_space; + buf_avail = buf_space / 2048; + buf_avail = (buf_avail/10) * 10; + if (buf_avail < need_rings * need_rings_bufs) { + D("only %lu bufs available", buf_avail); + return -EINVAL; + } + g->extra_bufs = buf_avail - need_rings * need_rings_bufs; + D("extmem_siz %lu need_ifs %lu need_rings %lu buf_avail %lu " + "need_ring_bufs %lu extra_bufs %u", + g->extmem_siz, + need_ifs, need_rings, buf_avail, + need_rings_bufs, + g->extra_bufs + ); + prmvals[0] = need_ifs; + prmvals[1] = need_rings; + prmvals[2] = g->ring_objsize; + prmvals[3] = buf_avail; + + snprintf(extm, sizeof(extm), "@extmem:file=%s", g->extmem); + for (i = 0; i < 4; i++) { + snprintf(kv, sizeof(kv), prms[i], prmvals[i]); + if (strlcat(extm, kv, sizeof(extm)) >= sizeof(extm)) { + D("no space for %s", kv); + return -EINVAL; + } + } + if (strlcat(g->ifname, extm, sizeof(g->ifname)) >= + sizeof(g->ifname)) { + D("no space for %s", extm); + return -EINVAL; + } + } + /* internally nmport_parse() */ + D("now nmport_open %s", g->ifname); + //g->nmd = nmport_open(g->ifname); + if (nmport_enable_option("offset")) + goto nonetmap; + g->nmd = nmport_prepare(g->ifname); + if (g->nmd == NULL) { + D("Unable to prepare %s: %s", g->ifname, strerror(errno)); + goto nonetmap; + } + if (g->extra_bufs) { + g->nmd->reg.nr_extra_bufs = g->extra_bufs / g->nthreads; + } + error = nmport_open_desc(g->nmd); + if (error) { + D("Unable to open_desc %s: %s", g->ifname, strerror(errno)); + goto nonetmap; + } + D("got %u extra bufs at %u", g->nmd->reg.nr_extra_bufs, + 
g->nmd->nifp->ni_bufs_head); + + g->main_fd = g->nmd->fd; + D("mapped %lu at %p", (unsigned long)g->nmd->reg.nr_memsize>>10, g->nmd->mem); + + /* get num of queues in tx or rx */ + if (g->td_type == TD_TYPE_SENDER) + devqueues = g->nmd->reg.nr_tx_rings; + else + devqueues = g->nmd->reg.nr_rx_rings; + + /* validate provided nthreads. */ + if (g->nthreads < 1 || g->nthreads > devqueues) { + D("bad nthreads %d, have %d queues", g->nthreads, devqueues); + // continue, fail later + } + + if (g->verbose) { + struct netmap_if *nifp = g->nmd->nifp; + struct nmreq_register *reg = &g->nmd->reg; + + D("nifp at offset %lu, %d tx %d rx region %d", + reg->nr_offset, reg->nr_tx_rings, reg->nr_rx_rings, + reg->nr_mem_id); + for (i = 0; i <= reg->nr_tx_rings; i++) { + struct netmap_ring *ring = NETMAP_TXRING(nifp, i); + D(" TX%d at 0x%p slots %d", i, + (void *)((char *)ring - (char *)nifp), ring->num_slots); + } + for (i = 0; i <= reg->nr_rx_rings; i++) { + struct netmap_ring *ring = NETMAP_RXRING(nifp, i); + D(" RX%d at 0x%p slots %d", i, + (void *)((char *)ring - (char *)nifp), ring->num_slots); + } + } + + if (g->ifname2[0] != '\0') { + struct nmreq_header hdr; + struct nmreq_vale_attach reg; + int error; + size_t l = strlen("pst:") + strlen(g->ifname2); + + if (l + 1 > sizeof(hdr.nr_name)) { + g->main_fd = -1; + nmport_close(g->nmd); + goto nonetmap; + } + bzero(&hdr, sizeof(hdr)); + memcpy(hdr.nr_name, "pst:", strlen("pst:")); + memcpy(hdr.nr_name + strlen(hdr.nr_name), g->ifname2, + strlen(g->ifname2)); + hdr.nr_name[l] = '\0'; + hdr.nr_version = NETMAP_API; + hdr.nr_reqtype = NETMAP_REQ_PST_ATTACH; + hdr.nr_body = (uintptr_t)® + + bzero(®, sizeof(reg)); + reg.reg.nr_mem_id = g->nmd->reg.nr_mem_id; + reg.reg.nr_mode = NR_REG_NIC_SW; + error = ioctl(g->main_fd, NIOCCTRL, &hdr); + if (error < 0) { + perror("ioctl"); + D("failed in attach ioctl"); + nmport_close(g->nmd); + g->main_fd = -1; + } + } + +#endif /* !NOLIBNETMAP */ +nonetmap: + /* Print some debug information. */ + fprintf(stdout, + "%s %s: %d queues, %d threads and %d cpus.\n", "Working on", + g->ifname, + devqueues, + g->nthreads, + g->cpus); + /* return -1 if something went wrong. */ + if (g->dev_type == DEV_NETMAP && g->main_fd < 0) { + D("aborting"); + return -1; + } else if (g->td_type == TD_TYPE_DUMMY) { + D("this is dummy, %s and returning", + g->main_fd < 0 ? "failed" : "success"); + return 0; + } + + /* Install ^C handler. 
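+	 * Only the main thread must receive SIGINT: sigint_h() flags
+	 * every worker for cancellation via targs[i].cancel.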
*/ + global_nthreads = g->nthreads; + sigemptyset(&ss); + sigaddset(&ss, SIGINT); + /* block SIGINT now, so that all created threads will inherit the mask */ + if (pthread_sigmask(SIG_BLOCK, &ss, NULL) < 0) { + D("failed to block SIGINT: %s", strerror(errno)); + } + nm_start_threads(g); + + /* Install the handler and re-enable SIGINT for the main thread */ + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = sigint_h; + if (sigaction(SIGINT, &sa, NULL) < 0) { + D("failed to install ^C handler: %s", strerror(errno)); + } + + if (pthread_sigmask(SIG_UNBLOCK, &ss, NULL) < 0) { + D("failed to re-enable SIGINT: %s", strerror(errno)); + } + + nm_main_thread(g); + + for (i = 0; i < g->nthreads; i++) { + if (targs[i].opaque) + free(targs[i].opaque); + } + free(targs); + return 0; +} + + +#define IPV4TCP_HDRLEN 66 +static inline int +netmap_sendmsg (struct nm_msg *msgp, void *data, size_t len) +{ + struct netmap_ring *ring = (struct netmap_ring *) msgp->txring; + u_int cur = ring->cur; + struct netmap_slot *slot = &ring->slot[cur]; + char *p = NETMAP_BUF_OFFSET(ring, slot) + IPV4TCP_HDRLEN; + + memcpy (p, data, len); + slot->len = IPV4TCP_HDRLEN + len; + nm_pst_setfd(slot, nm_pst_getfd(msgp->slot)); + nm_pst_setdoff(slot, IPV4TCP_HDRLEN); + ND("slot->buf_idx %u slot->len %u slot->fd %u", slot->buf_idx, slot->len, nm_pst_getfd(slot)); + ring->cur = ring->head = nm_ring_next(ring, cur); + return len; +} + +#define NM_NOEXTRA (~0U) +/* curp is reset when it wraps */ +static inline uint32_t +netmap_extra_next(struct nm_targ *t, size_t *curp, int wrap) +{ + uint32_t ret = t->extra_cur; + + if (unlikely(ret == t->extra_num)) { + if (!wrap) { + return NM_NOEXTRA; + } + ret = t->extra_cur = 0; + if (curp) { + *curp = 0; + } + } + t->extra_cur++; + return ret; +} + +#ifdef NMLIB_EXTRA_SLOT +static int inline +netmap_copy_out(struct nm_msg *nmsg) +{ + struct netmap_ring *ring = nmsg->rxring; + struct netmap_slot *slot = nmsg->slot; + struct nm_targ *t = nmsg->targ; + char *p, *ep; + uint32_t i = slot->buf_idx; + uint32_t extra_i = netmap_extra_next(t, (size_t *)&t->extra_cur, 0); + u_int off = nm_pst_getdoff(slot); + u_int len = slot->len; + struct netmap_slot tmp = {.buf_idx = extra_i}; + + if (extra_i == NM_NOEXTRA) + return -1; + NETMAP_WOFFSET(ring, &tmp, NETMAP_ROFFSET(ring, slot)); + p = NETMAP_BUF_OFFSET(ring, slot) + off; + ep = NETMAP_BUF_OFFSET(ring, &tmp) + off; + memcpy(ep, p, len - off); + for (i = 0; i < len - off; i += 64) { + _mm_clflush(ep + i); + } + return 0; +} + +/* XXX should we update nmsg->slot to new one? 
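+ * For now the ring slot keeps its address and merely receives the
+ * extra buffer index, so the caller's nmsg->slot pointer stays valid.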
*/ +static int inline +netmap_swap_out(struct nm_msg *nmsg) +{ + struct netmap_slot *slot = nmsg->slot, *extra, tmp; + struct nm_targ *t = nmsg->targ; + uint32_t extra_i = netmap_extra_next(t, (size_t *)&t->extra_cur, 0); + + if (extra_i == NM_NOEXTRA) + return -1; + tmp = *slot; + extra = &t->extra[extra_i]; + ND("%u is swaped with extra[%d] %u", i, extra_i, extra->buf_idx); + slot->buf_idx = extra->buf_idx; + slot->flags |= NS_BUF_CHANGED; + *extra = tmp; + return 0; +} +#endif /* NMLIB_EXTRA_SLOT */ + +static inline void +free_if_exist(void *p) +{ + if (p != NULL) + free(p); +} + +static int fdtable_expand(struct nm_targ *t) +{ + int *newfds, fdsiz = sizeof(*t->fdtable); + int nfds = t->fdtable_siz; + + newfds = (int *)calloc(nfds * 2, fdsiz); + if (!newfds) { + perror("calloc"); + return ENOMEM; + } + memcpy(newfds, t->fdtable, fdsiz * nfds); + free(t->fdtable); + //mm_mfence(); // XXX + t->fdtable = newfds; + t->fdtable_siz = nfds * 2; + return 0; +} + +#ifdef WITH_CLFLUSHOPT +static inline void +wait_ns(long ns) +{ + struct timespec cur, w; + + if (unlikely(ns > 10000 || ns < 100)) { + RD(1, "ns %ld may not be apprepriate", ns); + } + clock_gettime(CLOCK_REALTIME, &cur); + for (;;) { + clock_gettime(CLOCK_REALTIME, &w); + w = timespec_sub(w, cur); + if (unlikely(w.tv_sec < 0)) // maybe too short interval + continue; + else if (w.tv_nsec >= ns || w.tv_sec > 0) + break; + } +} +#endif /* WITH_CLFLUSHOPT */ + +static void +do_nm_rx_ring(struct nm_targ *t, int ring_nr) +{ + struct netmap_ring *rxr = NETMAP_RXRING(t->nmd->nifp, ring_nr); + struct netmap_ring *txr = NETMAP_TXRING(t->nmd->nifp, ring_nr); + u_int const rxtail = rxr->tail; + u_int rxcur = rxr->cur; + + for (; rxcur != rxtail; rxcur = nm_ring_next(rxr, rxcur)) { + struct netmap_slot *rxs = &rxr->slot[rxcur]; + struct nm_msg m = {.rxring = rxr, .txring = txr, .slot = rxs, .targ = t, .fd = nm_pst_getfd(rxs)} ; + + if (t->g->data) + t->g->data(&m); + nm_update_ctr(t, 1, rxs->len - nm_pst_getdoff(rxs)); + } + rxr->head = rxr->cur = rxcur; +#ifdef WITH_CLFLUSHOPT + _mm_mfence(); + if (t->g->emu_delay) { + wait_ns(t->g->emu_delay); + } +#endif /* WITH_CLFLUSHOPT */ +} + +static void +do_nm_tx_ring(struct nm_targ *t, int ring_nr) +{ + struct nm_msg m = {.txring = NETMAP_TXRING(t->nmd->nifp, ring_nr), + .targ = t}; + if (t->g->writable) + t->g->writable(&m); +} + +static int inline +soopton(int fd, int level, int type) +{ + const int on = 1; + + if (setsockopt(fd, level, type, &on, sizeof(int)) < 0) { + perror("setsockopt"); + return 1; + } + return 0; +} + +static int inline +do_setsockopt(int fd) +{ + const int on = 1; + struct linger sl = {.l_onoff = 1, .l_linger = 0}; + + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &sl, sizeof(sl))) + return -EFAULT; + if (soopton(fd, SOL_SOCKET, SO_REUSEADDR) || + soopton(fd, SOL_SOCKET, SO_REUSEPORT) || +#ifdef __FreeBSD__ + //soopton(fd, SOL_SOCKET, SO_REUSEPORT_LB) || +#endif /* __FreeBSD__ */ + soopton(fd, SOL_TCP, TCP_NODELAY)) + return -EFAULT; + if (ioctl(fd, FIONBIO, &on) < 0) { + perror("ioctl"); + return -EFAULT; + } + return 0; +} + +static int do_accept(struct nm_targ *t, int fd, int epfd) +{ +#ifdef linux + struct epoll_event ev; +#else + struct kevent ev; +#endif + struct sockaddr_in sin; + socklen_t addrlen; + int newfd; + //int val = 1; + while ((newfd = accept(fd, (struct sockaddr *)&sin, &addrlen)) != -1) { + //if (ioctl(fd, FIONBIO, &(int){1}) < 0) { + // perror("ioctl"); + //} + //int yes = 1; + //setsockopt(newfd, SOL_SOCKET, SO_BUSY_POLL, &yes, sizeof(yes)); + if (newfd >= 
t->fdtable_siz) { + if (fdtable_expand(t)) { + close(newfd); + break; + } + } + memset(&ev, 0, sizeof(ev)); +#ifdef linux + ev.events = POLLIN; + ev.data.fd = newfd; + epoll_ctl(epfd, EPOLL_CTL_ADD, newfd, &ev); +#else + EV_SET(&ev, newfd, EVFILT_READ, EV_ADD, 0, 0, NULL); + kevent(epfd, &ev, 1, NULL, 0, NULL); +#endif + } + return 0; +} + +#define DEFAULT_NFDS 65535 +#define ARRAYSIZ(a) (sizeof(a) / sizeof(a[0])) +static void * +netmap_worker(void *data) +{ + struct nm_targ *t = (struct nm_targ *) data; + struct nm_garg *g = t->g; + struct nmport_d *nmd = t->nmd; + struct pollfd pfd[2] = {{ .fd = t->fd }}; // XXX make variable size + struct nmreq_header hdr = g->nm_hdr; +#if DEBUG_SOCKET + int acceptfds[DEFAULT_NFDS]; + + bzero(acceptfds, sizeof(acceptfds)); +#endif /* DEBUG_SOCKET */ + + if (g->thread) { + int error = g->thread(t); + if (error) { + D("error on t->thread"); + goto quit; + } + } + + /* allocate fd table */ + t->fdtable = (int *)calloc(DEFAULT_NFDS, sizeof(*t->fdtable)); + if (!t->fdtable) { + perror("calloc"); + goto quit; + } + t->fdtable_siz = DEFAULT_NFDS; + + /* import extra buffers */ + if (g->dev_type == DEV_NETMAP) { + const struct nmreq_register *reg = &nmd->reg; + const struct netmap_if *nifp = nmd->nifp; + //const struct netmap_ring *any_ring = nmd->some_ring; + const struct netmap_ring *any_ring = NETMAP_TXRING(nmd->nifp, nmd->first_tx_ring); + uint32_t next = nifp->ni_bufs_head; + const u_int n = reg->nr_extra_bufs; + uint32_t i; + + D("have %u extra buffers from %u ring %p", n, next, any_ring); +#ifdef NMLIB_EXTRA_SLOT + t->extra = (struct netmap_slot *)calloc(n, sizeof(*t->extra)); +#else + t->extra = (uint32_t *)calloc(n, sizeof(*t->extra)); +#endif + if (!t->extra) { + perror("calloc"); + goto quit; + } + for (i = 0; i < n && next; i++) { + char *p; + struct netmap_slot tmp = any_ring->slot[0]; + tmp.ptr = 0; // XXX +#ifdef NMLIB_EXTRA_SLOT + t->extra[i].buf_idx = next; +#else + t->extra[i] = next; +#endif + tmp.buf_idx = next; + p = NETMAP_BUF_OFFSET(any_ring, &tmp); + next = *(uint32_t *)p; + } + t->extra_num = i; + D("imported %u extra buffers", i); + } else if (g->dev_type == DEV_SOCKET) { +#ifdef linux + struct epoll_event ev; + + t->fd = epoll_create1(EPOLL_CLOEXEC); + if (t->fd < 0) { + perror("epoll_create1"); + t->cancel = 1; + goto quit; + } + + /* XXX make variable ev num. */ + bzero(&ev, sizeof(ev)); + ev.events = POLLIN; + ev.data.fd = g->fds[0]; + if (epoll_ctl(t->fd, EPOLL_CTL_ADD, ev.data.fd, &ev)) { + perror("epoll_ctl"); + t->cancel = 1; + goto quit; + } +#else /* !linux */ + struct kevent ev; + + t->fd = kqueue(); + if (t->fd < 0) { + perror("kqueue"); + t->cancel = 1; + goto quit; + } + EV_SET(&ev, g->fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL); + if (kevent(t->fd, &ev, 1, NULL, 0, NULL)) { + perror("kevent"); + t->cancel = 1; + goto quit; + } +#endif /* linux */ + } + + /* + * register connected sockets + */ + if (g->dev_type == DEV_NETMAP) { + u_int i; + for (i = 0; i < t->g->cfdnum; i++) { + struct nmreq_pst_fd_reg fdr; + + fdr.fd = t->g->cfds[i]; + hdr.nr_body = (uintptr_t)&fdr; + if (ioctl(t->fd, NIOCCTRL, &hdr)) { + perror("ioctl"); + D("error i %d fd %d", i, t->g->cfds[i]); + } else { + D("registered %d fd %d", i, t->g->cfds[i]); + } + } + } + + while (!t->cancel) { + struct nm_msg msg; + + if (g->dev_type == DEV_NETMAP) { + u_int first_ring = nmd->first_rx_ring; + u_int last_ring = nmd->last_rx_ring; + u_int i; + struct netmap_slot slot; + int n; + + pfd[0].fd = t->fd; + pfd[0].events = t->g->pollevents; + /* XXX make safer... 
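+			 * pfd[] is statically sized for the netmap fd plus
+			 * one listen fd (see the XXX above); a larger
+			 * g->fdnum would overrun it.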
*/ + for (i = 0; i < t->g->fdnum; i++) { + pfd[i+1].fd = t->g->fds[i]; + pfd[i+1].events = POLLIN; + } + n = poll(pfd, i+1, t->g->polltimeo); + if (n < 0) { + perror("poll"); + goto quit; + } + /* + * check listen sockets + */ + for (i = 1; i <= t->g->fdnum; i++) { + struct sockaddr_storage tmp; + struct sockaddr *sa = (struct sockaddr *)&tmp; + int newfd; + socklen_t len = sizeof(tmp); + int e; + struct nmreq_pst_fd_reg fdr; + + if (!(pfd[i].revents & POLLIN)) + continue; + newfd = accept(pfd[i].fd, sa, &len); + if (newfd < 0) { + RD(1, "accept error"); + /* ignore this socket */ + continue; + } + + fdr.fd = newfd; + hdr.nr_body = (uintptr_t)&fdr; + e = ioctl(t->fd, NIOCCTRL, &hdr); + if (e) { + perror("ioctl"); + if (errno == ENOTCONN) { + D("ENOTCONN closing newfd %d", newfd); + close(newfd); + } else if (errno == ENOMEM) { + D("ENOMEM closing newfd %d", newfd); + close(newfd); +close_pfds: + for (i = 1; i < g->fdnum; i++) { + close(pfd[i].fd); + } + goto quit; + } else { + D("undefined error %d", errno); + } + } + if (unlikely(newfd >= t->fdtable_siz)) { + if (fdtable_expand(t)) { + goto close_pfds; + } + } + nm_pst_setfd(&slot, newfd); + msg.slot = &slot; + if (g->connection) + g->connection(&msg); +#if DEBUG_SOCKET + acceptfds[newfd] = newfd; +#endif + } + + /* check the netmap fd */ + if (pfd[0].revents & POLLIN) { + for (i = first_ring; i <= last_ring; i++) { + do_nm_rx_ring(t, i); + } + } + if (pfd[0].revents & POLLOUT) { + for (i = first_ring; i <= last_ring; i++) { + do_nm_tx_ring(t, i); + } + } + } else if (g->dev_type == DEV_SOCKET) { + int i, nfd, epfd = t->fd; + int nevts = ARRAYSIZ(t->evts); +#ifdef linux + struct epoll_event *evts = t->evts; + + nfd = epoll_wait(epfd, evts, nevts, g->polltimeo); + if (nfd < 0) { + perror("epoll_wait"); + goto quit; + } +#else + struct kevent *evts = t->evts; + + nfd = kevent(epfd, NULL, 0, evts, nevts, g->polltimeo_ts); +#endif + for (i = 0; i < nfd; i++) { + u_int j; +#ifdef linux + int fd = evts[i].data.fd; +#else + int fd = evts[i].ident; +#endif + + for (j = 0; j < t->g->fdnum; j++) { + if (fd != t->g->fds[j]) { + continue; + } + do_accept(t, fd, epfd); + break; + } + if (j != t->g->fdnum) + continue; + msg.fd = fd; + msg.targ = t; + g->read(&msg); + } + } + } +#if DEBUG_SOCKET + if (t->cancel) { + int i; + D("canceled, closing sockets"); + for (i = 0; i < DEFAULT_NFDS; i++) { + close(acceptfds[i]); + } + } +#endif /* DEBUG_SOCKET */ +quit: + free_if_exist(t->extra); + free_if_exist(t->fdtable); + return (NULL); +} + +// XXX inline just to scilence compiler +static inline void * +do_mmap(int fd, size_t len) +{ + void *p; + + if (lseek(fd, len -1, SEEK_SET) < 0) { + perror("lseek"); + return NULL; + } + if (write(fd, "", 1) != 1) { + perror("write"); + return NULL; + } + p = mmap(0, len, PROT_WRITE, MAP_SHARED | MAP_FILE, fd, 0); + if (p == MAP_FAILED) { + perror("mmap"); + return NULL; + } + return p; +} + + +/* + * Highest level abstraction mainly for PASTE + * + * ifname: netmap port name with prefix (e.g., pst:) + * and suffix (e.g., @/mnt/pm/x). + * ret: pointer to nm_garg allocated + * error: error value + * fds: array of listening file descriptors monitored by poll(). + * fdnum: number of file descriptors in fds. 
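+ * cfds, cfdnum: array and count of already-connected descriptors that
+ * are registered with the port before the loop starts.
+ * args: template nm_garg; out-of-range members fall back to defaults.
+ * garg_private: opaque pointer, retrievable later via g->garg_private.
+ *
+ * A minimal caller looks like the sketch below (error handling
+ * omitted; my_data_cb and my_priv are hypothetical):
+ *
+ *	struct nm_garg args = {0};
+ *	struct nm_garg *g;
+ *	int error;
+ *	int sd;			// a bound, listening TCP socket
+ *
+ *	args.nthreads = 1;
+ *	args.dev_type = DEV_NETMAP;
+ *	args.data = my_data_cb;	// called per received slot
+ *	netmap_eventloop("pst:0", (char *)"eth1", (void **)&g, &error,
+ *	    &sd, 1, NULL, 0, &args, my_priv);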
+ */ +static void +netmap_eventloop(const char *name, char *ifname, void **ret, int *error, + int *fds, int fdnum, int *cfds, int cfdnum, + struct nm_garg *args, void *garg_private) +{ + struct nm_garg *g = (struct nm_garg *)calloc(1, sizeof(*g)); + int i; + struct nmreq_header hdr; + struct nmctx ctx; +#ifndef NOLIBNETMAP + const char *namep = name; +#endif + + *error = 0; + if (!g) { + perror("calloc"); + *error = -ENOMEM; + return; + } + +#define B(a, v, l, h, d) \ + (!(a) ? d : (((a)->v >= l && (a)->v <= h) ? (a)->v : d)) + g->polltimeo = B(args, polltimeo, 0, 2000, 1000); + g->dev_type = B(args, dev_type, 0, DEV_SOCKET, DEV_SOCKET); + g->nthreads = B(args, nthreads, 1, 128, 1); + g->affinity = B(args, affinity, -1, 128, -1); + g->extmem_siz = B(args, extmem_siz, 0, 8192000000000UL, 0); + g->extra_bufs = B(args, extra_bufs, 0, 4096000000UL, 0); + g->ring_objsize = B(args, ring_objsize, RING_OBJSIZE/4, + RING_OBJSIZE*2, RING_OBJSIZE); +#undef B + g->targ_opaque_len = args->targ_opaque_len; + g->nmr_config = args->nmr_config; + g->extmem = args->extmem; + g->td_body = netmap_worker; + g->connection = args->connection; + g->data = args->data; + g->read = args->read; + g->thread = args->thread; + g->writable = args->writable; + g->fds = fds; + g->fdnum = fdnum; + g->cfds = cfds; + g->cfdnum = cfdnum; + g->pollevents = args->pollevents ? args->pollevents : POLLIN; +#ifdef __FreeBSD__ + g->polltimeo_ts = args->polltimeo_ts; +#endif /* FreeBSD */ +#ifdef WITH_CLFLUSHOPT + g->emu_delay = args->emu_delay; +#endif /* WITH_CLFLUSHOPT */ + *ret = g; + + for (i = 0; i < fdnum; i++) { + if (do_setsockopt(fds[i]) < 0) { + perror("setsockopt"); + *error = -EFAULT; + return; + } + } + for (i = 0; i < cfdnum; i++) { + if (do_setsockopt(cfds[i]) < 0) { + perror("setsockopt"); + *error = -EFAULT; + return; + } + } + + signal(SIGPIPE, SIG_IGN); + + /* Ensure correct name. Suffix may be added in nm_start() later */ + bzero(&ctx, sizeof(ctx)); + bzero(&hdr, sizeof(hdr)); +#ifndef NOLIBNETMAP + if (nmreq_header_decode(&namep, &hdr, &ctx) < 0) { + *error = -EINVAL; + return; + } +#endif + + strncpy(g->ifname, name, sizeof(g->ifname) - 1); + D("name %s g->ifname %s ifname %s", name, g->ifname, ifname); + if (ifname && strlen(ifname)) { + struct nmreq_header *h = &g->nm_hdr; + + strncpy(g->ifname2, ifname, sizeof(g->ifname2)); + /* pre-initialize ifreq for accept() */ + bzero(h, sizeof(*h)); + memcpy(h->nr_name, hdr.nr_name, sizeof(h->nr_name)); + h->nr_version = NETMAP_API; + h->nr_reqtype = NETMAP_REQ_PST_FD_REG; + // nr_body is per thread + } + g->garg_private = garg_private; + *error = nm_start(g); +} + +/* + * General routine to write data to netmap buffer(s). + * Data is written after `off` except for the first chunk, which + * is written after `off0` bytes. 
This is useful when the caller writes + * an app-level header beforehand + */ +const u_int DEFAULT_MTU = 1520; // maximum option space +static inline int +nm_write(struct netmap_ring *ring, const char *data, + size_t len, u_int off0, int fd) +{ + u_int const tail = ring->tail; + u_int cur = ring->cur; + size_t copied = 0; + const u_int space = nm_ring_space(ring); + size_t space_bytes; + const u_int off = IPV4TCP_HDRLEN; + + //if (unlikely(off + off0 > DEFAULT_MTU)) { + // D("total offset must be < %u", DEFAULT_MTU); + //} else if (unlikely(off > DEFAULT_MTU)) { + // D("offset must be < %u", DEFAULT_MTU); + //} + + space_bytes = (DEFAULT_MTU - off) * space - off0; + if (unlikely(!space || space_bytes < len)) { + RD(1, "no space (%d slots)", space); + return -1; + } + + while (likely(cur != tail) && copied < len) { + struct netmap_slot *slot = &ring->slot[cur]; + char *p = NETMAP_BUF_OFFSET(ring, slot) + off + off0; + int l = len - copied; + + if (l > (int)(DEFAULT_MTU - off0)) + l = (int)(DEFAULT_MTU - off0); + if (data) { + nm_pkt_copy(data + copied, p, l); + } + slot->len = off + off0 + l; + nm_pst_setdoff(slot, off); + nm_pst_setfd(slot, fd); + copied += l; + off0 = 0; + cur = nm_ring_next(ring, cur); + } + ring->cur = ring->head = cur; + return len; +} + +static inline struct netmap_slot * +nm_zcopy(struct netmap_ring *txr, struct netmap_slot *slot) +{ + struct netmap_slot tmp, *txs = NULL; + + if (unlikely(nm_ring_space(txr) == 0)) { + return NULL; + } + txs = &txr->slot[txr->cur]; + if (likely(slot != txs)) { + tmp = *txs; + *txs = *slot; + txs->flags |= NS_BUF_CHANGED; + *slot = tmp; + slot->flags |= NS_BUF_CHANGED; // might be on-ring + } + txr->cur = txr->head = nm_ring_next(txr, txr->cur); + return txs; +} +#endif /* _NMLIB_H_ */ diff --git a/apps/phttpd/phttpd.c b/apps/phttpd/phttpd.c new file mode 100644 index 000000000..dae33d4c8 --- /dev/null +++ b/apps/phttpd/phttpd.c @@ -0,0 +1,1001 @@ +/* + * Copyright (C) 2016-2017 Michio Honda. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#ifdef __FreeBSD__ +#include +#include +#endif /* __FreeBSD__ */ +#include +#include +#include +#include +#define NMLIB_EXTRA_SLOT 1 +#include "nmlib.h" +#ifdef WITH_BPLUS +#include +#include +#endif /* WITH_BPLUS */ +#ifdef WITH_NOFLUSH +#define _mm_clflush(p) (void)(p) +#define _mm_mfence() (0) +#endif +#ifdef WITH_CLFLUSHOPT +#define _mm_clflush(p) _mm_clflushopt(p) +#endif +#ifdef WITH_LEVELDB +#include +#include +#endif /* WITH_LEVELDB */ + +//#define MYHZ 2400000000 +#ifdef MYHZ +static __inline unsigned long long int rdtsc(void) +{ + unsigned a, d; + __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((unsigned long long)a) | (((unsigned long long)d) << 32);; +} + +static inline void +user_clock_gettime(struct timespec *ts) +{ + unsigned long long now; + + now = rdtsc(); + ts->tv_sec = now/MYHZ; + ts->tv_nsec = (now%MYHZ)*1000000000/MYHZ; +} +#endif /* MYHZ */ + +#define PST_NAME "pst:0" +#define EXTMEMFILE "netmap_mem" +#define BPLUSFILE "bplus" +#define DATAFILE "dumb" +#define LEVELDBFILE "leveldb" +#define LEVELDBMEMFILE "leveldb_mem" + +#define NETMAP_BUF_SIZE 2048 +#define GET_LEN 4 // the request look like GET /3 +#define POST_LEN 5 + +#define EPOLLEVENTS 2048 +#define MAXQUERYLEN 32767 + +#define MAX_HTTPLEN 65535 + +#define DF_FDSYNC 0x1 +#define DF_PASTE 0x2 +#define DF_BPLUS 0x4 +#define DF_MMAP 0x10 +#define DF_PMEM 0x20 +#define DF_LEVELDB 0x40 + +#define CLSIZ 64 /* XXX */ + +struct dbctx { + int flags; + size_t size; + size_t pgsiz; + int i; + int fd; + char *paddr; + void *vp; // gfile_t +#ifdef WITH_LEVELDB + leveldb::DB *leveldb; +#endif /* WITH_LEVELDB */ + size_t cur; +}; + +struct phttpd_global { + char ifname[NETMAP_REQ_IFNAMSIZ]; + int extmemfd; + int sd; + char *http; + int httplen; + int msglen; + struct { + int flags; + size_t size; + char *dir; // directory path for data, metadata ane ppool + } dba; +}; + +static inline int +is_pm(struct dbctx *d) +{ + return !!(d->flags & DF_PMEM); +} + +static inline size_t +get_aligned(size_t len, size_t align) +{ + size_t d = len & (align - 1); + return d ? 
len + align - d : len; +} + +#if 0 +static u_int stat_nfds; +static u_int stat_eps; +static u_int stat_maxnfds; +static u_int stat_minnfds; +static uint64_t stat_vnfds; +#endif /* 0 */ + +static char *HTTPHDR = (char *)"HTTP/1.1 200 OK\r\n" + "Connection: keep-alive\r\n" + "Server: Apache/2.2.800\r\n" + "Content-Length: "; +#define HTTPHDR_LEN 81 + +ssize_t +generate_httphdr(size_t content_length, char *buf) +{ + char *c = buf; + c = mempcpy(c, HTTPHDR, HTTPHDR_LEN); + c += sprintf(c, "%lu\r\n\r", content_length); + *c++ = '\n'; + return c - buf; +} + +#define SKIP_POST 48 +static int +parse_post(char *post, const size_t len, + size_t *coff, size_t *clen, size_t *thisclen) +{ + char *pp, *p = strstr(post + SKIP_POST, (char *)"Content-Length: "); + char *end; + + *coff = 0; + if (unlikely(!p)) + return -1; + pp = p + 16; // strlen("Content-Length: ") + *clen = strtol(pp, &end, 10); + if (unlikely(end == pp)) + return -1; + pp = strstr(pp, "\r\n\r\n"); + if (unlikely(!pp)) + return -1; + pp += 4; + *coff = pp - post; + *thisclen = len - *coff; + return 0; +} + +static void +usage(void) +{ + fprintf(stderr, + "Usage:\n" + "\t[-P port] TCP listen port (default 60000)\n" + "\t[-l size] message length excluding HTTP header (default 64)\n" + "\t[-b ms] timeout in poll(2), kqueue(2) or epoll_wait(2) (default 2000)\n" + "\t[-d path] database directory (e.g., /mnt/pmem)\n" + "\t\tTo recognize PM, the path string must include \"pm\"\n" + "\t[-L MB] size of database file given by -d\n" + "\t[-i name] netmap(2) port name. This indicates use of PASTE\n" + "\t[-x] use extmem. 32B per buffer is reserved for metadata\n" + "\t[-a affinity] (same semantics with pkt-gen)\n" + "\t[-p nthreads] (same semantics with pkt-gen)\n" + "\t[-C config] virtual port configuration in vale-ctl(8) syntax\n" + "\t[-m] mmap(2) database given by -d. For PM, always specify\n" + "\t[-D] use fdatasync(2) instead of fsync\n" + "\t[-c] static HTTP header\n" + "\t[-B] use B+tree (need phttpd-b)\n" + "\t[-F] don't clflush on PM (need phttpd-f) \n" + "\t[-e ns] emulate PM access time (need phttpd-o)\n" + "\t[-h] show this help\n" + + "\nExamples:\n" + "\t1. No database or PASTE\n\n" + "\t# phttpd -b 0 -c\n\n" + "\t2. PASTE but w/o any database\n\n" + "\t# phttpd -b 0 -i eth1 -c\n\n" + "\t3. WAL and copy\n\n" + "\t# phttpd -b 0 -i eth1 -d /mnt/pmem -L 768 -m -c \n\n" + "\t where /mnt/pmem must be on a DAX-enabled filesystem on (emulated)\n" + "\t PM and have at least 8 GB capacity\n\n" + "\t4. WAL w/o copy\n\n" + "\t# phttpd -b 0 -i eth1 -d /mnt/pmem -x -L 768 -m -c \n\n" + "\t5. WAL w/o copy and four CPU cores/threads\n\n" + "\t# phttpd -b 0 -i eth1 -d /mnt/pmem -x -L 768 -m -c -C 0,0,4,4 -p 4\n\n" + "\t where the underlying NIC must have at least 4 queues\n\n" + "\t6. B+tree w/o copy\n\n" + "\t# phttpd-b -b 0 -i eth1 -d /mnt/pmem -x -L 768 -m -c -B\n\n" + + "\nTips:\n" + "\t1. wrk HTTP benchmark tool is useful as the client.\n" + "\t To generate HTTP POST traffic, use lua script like:\n\n" + "\twrk.method = \"POST\"\n" + "\ts = \"foo=bar&baz=quux\"\n" + "\twrk.body = s\n" + "\tfor i = 0, 79 do\n" + "\t\twrk.body = wrk.body..s\n" + "\tend\n" + "\twrk.headers[\"Content-Type\"] = \"application/x-www-form-urlencoded\"\n\n" + "\t This script passed to wrk with -s generates 1280B HTTP POSTs\n\n" + + "\t2. Make sure all the hardware offloading are disabled except for\n" + "\t tx-checksum-ip-generic (e1000, ixgbe) or tx-checksum-ipv4 (i40e)\n" + "\t in Linux.\n\n" + + "\t3. 
When using busy-polling (-b 0), set NIC interrupt interval\n" + "\t as long as possible (PASTE) or as short as possible (w/o PASTE)\n\n" + + ); + + exit(1); +} + +static int +writesync(char *buf, ssize_t len, size_t space, int fd, size_t *pos, int fdsync) +{ + int error; + size_t cur = *pos; + + if (unlikely(cur + len > space)) { + if (lseek(fd, 0, SEEK_SET) < 0) { + perror("lseek"); + return -1; + } + cur = 0; + } + len = write(fd, buf, len); + if (unlikely(len < 0)) { + perror("write"); + return -1; + } + cur += len; + error = fdsync ? fdatasync(fd) : fsync(fd); + if (unlikely(error)) { + fprintf(stderr, "failed in f%ssync\n", fdsync ? "data" : ""); + return -1; + } + *pos = cur; + return 0; +} + +static inline uint64_t +pack(uint32_t idx, uint16_t off, uint16_t len) +{ + return (uint64_t)idx << 32 | off << 16 | len; +} + +#ifdef WITH_BPLUS +static inline void +nmidx_bplus(gfile_t *vp, btree_key key, uint32_t bufidx, size_t off, size_t len) +{ + uint64_t packed; + static int unique = 0; + int rc; + + packed = pack(bufidx, off, len); + rc = btree_insert(vp, key, packed); + if (rc == 0) + unique++; + ND("k %lu v %lu idx %u off %lu len %lu", key, packed, bufidx, off, len); +} +#endif /* WITH_BPLUS */ + +static inline void +nmidx_wal(char *paddr, size_t *pos, size_t dbsiz, uint32_t bufidx, + size_t off, size_t len) +{ + uint64_t packed; + size_t cur = *pos; + size_t plen = sizeof(packed); + char *p = paddr; + + /* make log */ + packed = pack(bufidx, off, len); + /* position log */ + if (unlikely(plen > dbsiz - cur)) + cur = 0; + p += cur; + *(uint64_t *)p = packed; + _mm_clflush(p); + *pos = cur + plen; +} + +static inline void +copy_and_log(char *paddr, size_t *pos, size_t dbsiz, char *buf, size_t len, + size_t pagesiz, int pm, void *vp, uint64_t key) +{ + char *p; + int mlen = vp ? 0 : sizeof(uint64_t); + size_t cur = *pos; + u_int i = 0; + size_t aligned = len; + size_t align = pagesiz; + + if (pm) { +#ifdef WITH_BPLUS + if (vp) + align = NETMAP_BUF_SIZE; + else +#endif + align = 0; + } + if (align > 0) { + aligned = get_aligned(len, align); + } + /* Do we have a space? */ + if (unlikely(cur + aligned + mlen > dbsiz)) { + cur = 0; + } + p = paddr + cur + mlen; // leave a log entry space + memcpy(p, buf, len); + if (pm) { + for (; i < len; i += CLSIZ) { + _mm_clflush(p + i); + } + } + p -= mlen; + if (!pm) { + if (msync(p, len + mlen, MS_SYNC)) + perror("msync"); + } +#ifdef WITH_BPLUS + if (vp) { + static int unique = 0; + uint64_t packed = pack(cur/NETMAP_BUF_SIZE, 0, len); + int rc = btree_insert((gfile_t *)vp, key, packed); + if (rc == 0) { + unique++; + } + } else +#endif + { + *(uint64_t *)p = len; + if (pm) + _mm_clflush(p); + else { + msync(p, sizeof(size_t), MS_SYNC); + } + } + *pos = cur + aligned + (align ? 
0 : mlen); +} + +enum http {NONE=0, POST, GET}; +static __inline int +httptype(const char *p) +{ + enum http type = NONE; + + if (!strncmp(p, "POST ", POST_LEN)) { + type = POST; + } else if (!strncmp(p, "GET ", GET_LEN)) { + type = GET; + } + return type; +} + +static int +phttpd_req(char *req, int len, struct nm_msg *m, int *no_ok, + size_t *msglen, char **content) +{ + struct dbctx *db = (struct dbctx *)m->targ->opaque; + int *fde = &m->targ->fdtable[m->fd]; + char *datap; + + const int flags = db->flags; + const size_t dbsiz = db->size; + + *no_ok = 0; + + switch (httptype(req)) { + uint64_t key; + size_t coff, clen, thisclen; + + case NONE: + if (unlikely(*fde <= 0)) { + *no_ok = 1; + *fde = 0; + break; + } + *fde -= len; + if (unlikely(*fde < 0)) { + D("bad leftover %d (len %d)", *fde, len); + *fde = 0; + } else if (*fde > 0) { + *no_ok = 1; + } + break; + case POST: + if (parse_post(req, len, &coff, &clen, &thisclen)) { + return 0; + } + if (clen > thisclen) { + *fde = clen - thisclen; + *no_ok = 1; + } + datap = req + coff; + key = *(uint64_t *)datap; + + if (flags & DF_PASTE) { + u_int i = 0; + struct netmap_slot tmp, *extra, *slot = m->slot; + uint32_t xi = netmap_extra_next(m->targ, &db->cur, 1); + const u_int off = NETMAP_ROFFSET(m->rxring, slot) + + nm_pst_getdoff(slot) + coff; + /* flush data buffer */ + for (; i < thisclen; i += CLSIZ) { + _mm_clflush(datap + i); + } +#ifdef WITH_BPLUS + if (db->vp) { + nmidx_bplus((gfile_t *)db->vp, key, + slot->buf_idx, off, thisclen); + } else +#endif + if (db->paddr) { + nmidx_wal(db->paddr, &db->cur, dbsiz, + slot->buf_idx, off, thisclen); + } + + /* swap out buffer */ + extra = &m->targ->extra[xi]; + tmp = *slot; + slot->buf_idx = extra->buf_idx; + slot->flags |= NS_BUF_CHANGED; + *extra = tmp; + extra->flags &= ~NS_BUF_CHANGED; +#ifdef WITH_LEVELDB + } else if (db->leveldb) { + leveldb::Slice skey((char *)&key, sizeof(key)); + leveldb::Slice sval((char *)&req, thisclen); + leveldb::Status status; + leveldb::WriteOptions write_options; + write_options.sync = true; + status = db->leveldb->Put(write_options, skey, sval); + if (!status.ok()) { + D("leveldb write error"); + } +#endif /* WITH_LEVELDB */ + } else if (db->paddr) { + copy_and_log(db->paddr, &db->cur, dbsiz, datap, + thisclen, db->pgsiz, is_pm(db), db->vp, key); + } else if (db->fd > 0) { + if (writesync(datap, len, dbsiz, db->fd, + &db->cur, flags & DF_FDSYNC)) { + return -1; + } + } else { + RD(1, "no db to save POST"); + } + break; + case GET: +#ifdef WITH_BPLUS + { + uint32_t _idx; + uint16_t _off, _len; + uint64_t datam = 0; + int rc; + + if (!db->vp) + break; + key = *(uint64_t *)(req + GET_LEN + 1); // jump '/' + rc = btree_lookup((gfile_t *)db->vp, key, &datam); + if (rc == ENOENT) + break; + _idx = datam >> 32; + _off = (datam & 0x00000000ffff0000) >> 16; + _len = datam & 0x000000000000ffff; + ND("found key %lu val %lu idx %u off %u len %u", + key, datam, _idx, _off, _len); + *msglen = _len; + if (flags & DF_PASTE) { + *content = NETMAP_BUF(m->rxring, _idx) + _off; + } else { + *content = db->paddr + NETMAP_BUF_SIZE * _idx; + } + } +#endif /* WITH_BPLUS */ + break; + default: + break; + } + return 0; +} + +static int +phttpd_data(struct nm_msg *m) +{ + struct phttpd_global *pg = (struct phttpd_global *) + m->targ->g->garg_private; + size_t msglen = pg->msglen, len = 0; + int error, no_ok = 0; + char *content = NULL; + u_int doff = nm_pst_getdoff(m->slot); +#ifdef MYHZ + struct timespec ts1, ts2, ts3; + user_clock_gettime(&ts1); +#endif + + len = m->slot->len - doff; + 
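+	/* doff skips the Ether/IP/TCP headers, leaving the HTTP payload length */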
if (unlikely(len == 0)) { + close(m->fd); + return 0; + } + + error = phttpd_req(NETMAP_BUF_OFFSET(m->rxring, m->slot) + doff, + len, m, &no_ok, &msglen, &content); + if (unlikely(error)) { + return error; + } + if (!no_ok) { + int httplen = pg->httplen; + struct netmap_ring *txr = m->txring; + char *p = NETMAP_BUF_OFFSET(txr, &txr->slot[txr->cur]) + + IPV4TCP_HDRLEN; + if (pg->http) { + memcpy(p, pg->http, httplen); + } else { + httplen = generate_httphdr(msglen, p); + } + len = nm_write(txr, content, msglen, httplen, m->fd); + if (unlikely(len < msglen)) { + D("no space"); + } + } +#ifdef MYHZ + user_clock_gettime(&ts2); + ts3 = timespec_sub(ts2, ts1); +#endif /* MYHZ */ + return 0; +} + +/* We assume GET/POST appears in the beginning of netmap buffer */ +static int +phttpd_read(struct nm_msg *m) +{ + struct phttpd_global *pg = (struct phttpd_global *) + m->targ->g->garg_private; + size_t msglen = pg->msglen, len = 0; + int error, no_ok = 0; + char *content = NULL; + char buf[MAXQUERYLEN]; + + len = read(m->fd, buf, sizeof(buf)); + if (len <= 0) { + close(m->fd); + return len == 0 ? 0 : -1; + } + + error = phttpd_req(buf, len, m, &no_ok, &msglen, &content); + if (unlikely(error)) + return error; + if (!no_ok) { + int httplen = pg->httplen; + + if (pg->http) { + memcpy(buf, pg->http, httplen); + } else { + httplen = generate_httphdr(msglen, buf); + } + if (content) { + memcpy(buf + httplen, content, msglen); + } +#ifdef WITH_CLFLUSHOPT + _mm_mfence(); + if (m->targ->g->emu_delay) { + wait_ns(m->targ->g->emu_delay); + } +#endif + len = write(m->fd, buf, httplen + msglen); + if (unlikely(len < 0)) { + perror("write"); + } else if (unlikely(len < httplen + msglen)) { + RD(1, "written %ld len %ld", len, httplen + msglen); + } + } + return 0; +} + +static int +init_db(struct dbctx *db, int i, const char *dir, int flags, size_t size) +{ + int fd = 0; + char path[64]; + + if (!dir) + return 0; + bzero(db, sizeof(*db)); + db->flags = flags; + db->size = size; + db->pgsiz = getpagesize(); + +#ifdef WITH_LEVELDB + if (db->flags & DF_LEVELDB) { + leveldb::Status status; + leveldb::Options options; + char mpath[64]; + std::string val; + + options.create_if_missing = true; + // 16GB + options.write_buffer_size = 16384000000; + options.nvm_buffer_size = 16384000000; + options.reuse_logs = true; + snprintf(path, sizeof(path), "%s/%s%d", dir, LEVELDBFILE, i); + snprintf(mpath, sizeof(mpath), "%s/%s%d", dir, LEVELDBMEMFILE, i); + status = leveldb::DB::Open(options, path, mpath, &db->leveldb); + if (!status.ok()) { + D("error to open leveldb %s", path); + return -1; + } + D("done Open LevelDB dbfile %s memfile %s", path, mpath); + + leveldb::WriteOptions write_options; + write_options.sync = false; + status = db->leveldb->Put(write_options, "100", "test"); + if (!status.ok()) { + D("leveldb write error"); + } + status = db->leveldb->Get(leveldb::ReadOptions(), "100", &val); + if (!status.ok()) { + D("leveldb read error"); + } + status = db->leveldb->Delete(leveldb::WriteOptions(), "100"); + if (!status.ok()) { + D("leveldb write error"); + } + D("leveldb test done (error reported if any)"); + } +#endif +#ifdef WITH_BPLUS + /* need B+tree ? 
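+	 * When combined with DF_PASTE, the tree alone indexes netmap
+	 * buffers and the data file below is never opened.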
*/ + if (db->flags & DF_BPLUS) { + int rc; + snprintf(path, sizeof(path), "%s/%s%d", dir, BPLUSFILE, i); + rc = btree_create_btree(path, ((gfile_t **)&db->vp)); + D("btree_create_btree() done (%d) %s", rc, path); + if (rc != 0) + return -1; + else if (db->flags & DF_PASTE) + return 0; + } +#endif /* WITH_BPLUS */ + snprintf(path, sizeof(path), "%s/%s%d", dir, DATAFILE, i); + fd = open(path, O_RDWR | O_CREAT, S_IRWXU); + if (fd < 0) { + perror("open"); + return -1; + } + if (db->flags & DF_MMAP) { + if (fallocate(fd, 0, 0, db->size) < 0) { + perror("fallocate"); + close(fd); + return -1; + } + db->paddr = (char *)do_mmap(fd, db->size); + if (db->paddr == NULL) { + close(fd); + return -1; + } + } + db->fd = fd; + return 0; +} + +static int +phttpd_thread(struct nm_targ *targ) +{ + struct nm_garg *nmg = targ->g; + struct phttpd_global *g = + (struct phttpd_global *)nmg->garg_private; + + if (init_db((struct dbctx *)targ->opaque, targ->me, g->dba.dir, + g->dba.flags, g->dba.size / nmg->nthreads)) { + D("error on init_db"); + return ENOMEM; + } + return 0; +} + +void +clean_dir(char *dirpath) +{ + DIR *dp; + struct dirent *ent; + + if (!dirpath) + return; + if ((dp = opendir(dirpath)) == NULL) { + return; + } + while ((ent = readdir(dp))) { + char fullp[256]; // XXX + size_t l; + + if (ent->d_name[0] == '.') + continue; + else if (strstr(ent->d_name, EXTMEMFILE) == NULL && + strstr(ent->d_name, BPLUSFILE) == NULL && + strstr(ent->d_name, DATAFILE) == NULL) + continue; + strncat(strncpy(fullp, dirpath, sizeof(fullp) - 2), "/", 2); + l = strlen(fullp) + strlen(ent->d_name) + 1; + if (l < sizeof(fullp)) { + strncat(fullp, ent->d_name, l); + } + //strncat(fullp, ent->d_name, sizeof(fullp) - strlen(fullp) - 1); + D("removing %s", fullp); + if (unlink(fullp)) + perror("unlink"); + } +} + +char * +chkpath(char *dir, char *f, size_t size, int rm) +{ + int fd, mode = O_RDWR|O_CREAT; + char *path; + + if (asprintf(&path, "%s/%s", dir, f) < 0) { + return NULL; + } + if ((fd = open(path, mode, S_IRWXU)) < 0) { + perror("open"); + free(path); + return NULL; + } + if (fallocate(fd, 0, 0, size)) { + D("fallocate %s failed size %lu", path, size); + close(fd); + free(path); + return NULL; + } + if (rm) + unlink(path); + close(fd); + return path; +} + +int +main(int argc, char **argv) +{ + int ch; + struct sockaddr_in sin; + int port = 60000; + struct phttpd_global pg; + struct nm_garg nmg, *g; + int error = 0; + + bzero(&nmg, sizeof(nmg)); + nmg.nmr_config = NULL; + nmg.nthreads = 1; + nmg.polltimeo = 2000; + nmg.dev_type = DEV_SOCKET; + nmg.td_type = TD_TYPE_OTHER; + nmg.targ_opaque_len = sizeof(struct dbctx); + nmg.ring_objsize = RING_OBJSIZE; + + nmg.thread = phttpd_thread; + nmg.read = phttpd_read; + + bzero(&pg, sizeof(pg)); + pg.msglen = 64; + + while ((ch = getopt(argc, argv, + "P:l:b:md:Di:cC:a:p:xL:BFe:hN")) != -1) { + switch (ch) { + default: + D("bad option %c %s", ch, optarg); + usage(); + break; + case 'h': + usage(); + break; + case 'P': /* server port */ + port = atoi(optarg); + break; + case 'l': /* HTTP OK content length */ + pg.msglen = atoi(optarg); + break; + case 'b': /* give the epoll_wait() timeo argument -1 */ + nmg.polltimeo = atoi(optarg); + break; + case 'd': /* directory of data store */ + { + pg.dba.dir = optarg; + if (optarg[strlen(optarg) - 1] == '/') + optarg[strlen(optarg) - 1] = '\0'; + if (strstr(optarg, "pm")) // XXX + pg.dba.flags |= DF_PMEM; + } + break; + case 'L': + pg.dba.size = atoll(optarg) * 1000000; // given in MB + break; + case 'm': + pg.dba.flags |= DF_MMAP; + 
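+			/* the -d data file will be mmap(2)ed; required for PM */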
break; + case 'D': + pg.dba.flags |= DF_FDSYNC; + break; + case 'i': + nmg.dev_type = DEV_NETMAP; + if (sizeof(pg.ifname) < strlen(optarg) + 1) + break; + strncpy(pg.ifname, optarg, sizeof(pg.ifname)); + nmg.read = NULL; + nmg.data = phttpd_data; + break; + case 'x': /* PASTE */ + pg.dba.flags |= DF_PASTE; + break; + case 'c': + pg.httplen = 1; + break; + case 'a': + nmg.affinity = atoi(optarg); + break; + case 'p': + nmg.nthreads = atoi(optarg); + break; + case 'C': + nmg.nmr_config = strdup(optarg); + break; +#ifdef WITH_BPLUS + case 'B': + pg.dba.flags |= DF_BPLUS; + break; +#endif /* WITH_BPLUS */ +#ifdef WITH_LEVELDB + case 'N': + pg.dba.flags |= DF_LEVELDB; + break; +#endif /* WITH_LEVELDB */ +#ifdef WITH_NOFLUSH + case 'F': // just to tell the script to use phttpd-f + break; +#endif /* WITH_NOFLUSH */ +#ifdef WITH_CLFLUSHOPT + case 'e': + nmg.emu_delay = atoi(optarg); + break; +#endif /* WITH_CLFLUSHOPT */ + } + + } + + clean_dir(pg.dba.dir); + + fprintf(stderr, "%s built %s %s db: %s\n", argv[0], __DATE__, __TIME__, + pg.dba.dir ? pg.dba.dir : "none"); + usleep(1000); + + argc -= optind; + argv += optind; + + /* + * Check invariants + */ + if (!port || !pg.msglen) + usage(); + else if (pg.dba.flags & DF_PASTE && strlen(pg.ifname) == 0) + usage(); +#ifdef WITH_BPLUS + else if (pg.dba.flags & DF_BPLUS && !(pg.dba.flags & DF_MMAP)) + usage(); +#endif /* WITH_BPLUS */ +#ifdef WITH_LEVELDB + else if (pg.dba.flags & DF_LEVELDB) { + if (pg.dba.flags & (DF_BPLUS|DF_PASTE|DF_MMAP|DF_FDSYNC)) + usage(); + } +#endif /* WITH_LEVELDB */ + + if (pg.dba.flags & DF_MMAP) { + char *path = chkpath(pg.dba.dir, DATAFILE, pg.dba.size, 1); + /* early check for storage space */ + if (!path) { + goto close_socket; + } + free(path); + /* allocate space for extmem */ + if (pg.dba.flags & DF_PASTE) { + /* 32B metadata per netmap buffer */ + u_int n = pg.dba.size / NETMAP_BUF_SIZE; + nmg.extmem_siz = pg.dba.size - 32 * n; + pg.dba.size -= nmg.extmem_siz; + nmg.extmem = chkpath(pg.dba.dir, EXTMEMFILE, + nmg.extmem_siz, 0); + if (nmg.extmem == NULL) + goto close_socket; + } + } +#ifdef __FreeBSD__ + /* kevent requires struct timespec for timeout */ + if (nmg.dev_type != DEV_NETMAP && nmg.polltimeo >= 0) { + struct timespec *x = calloc(1, sizeof(*x)); + if (!x) { + perror("calloc"); + usage(); + } + x->tv_sec = nmg.polltimeo / 1000; + x->tv_nsec = (nmg.polltimeo % 1000) * 1000000; + nmg.polltimeo_ts = x; + } +#endif /* FreeBSD */ + + /* Preallocate HTTP header */ + if (pg.httplen) { + pg.http = (char *)calloc(1, MAX_HTTPLEN); + if (!pg.http) { + perror("calloc"); + usage(); + } + pg.httplen = generate_httphdr(pg.msglen, pg.http); + } + pg.sd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (pg.sd < 0) { + perror("socket"); + return 0; + } + if (do_setsockopt(pg.sd)) { + goto close_socket; + } + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(port); + if (bind(pg.sd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + goto close_socket; + } + if (listen(pg.sd, SOMAXCONN) != 0) { + perror("listen"); + goto close_socket; + } + + netmap_eventloop(PST_NAME, pg.ifname, (void **)&g, &error, + &pg.sd, 1, NULL, 0, &nmg, &pg); + + free_if_exist(nmg.nmr_config); + +close_socket: + if (pg.extmemfd) { + if (nmg.extmem) { + munmap(nmg.extmem, nmg.extmem_siz); + free(nmg.extmem); + } + close(pg.extmemfd); + } + + if (pg.sd > 0) { + close(pg.sd); + } + free_if_exist(pg.http); +#ifdef __FreeBSD__ + free_if_exist(nmg.polltimeo_ts); +#endif + 
return 0; +} diff --git a/apps/phttpd/tree_test.c b/apps/phttpd/tree_test.c new file mode 100644 index 000000000..be1f729af --- /dev/null +++ b/apps/phttpd/tree_test.c @@ -0,0 +1,72 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include + +//#define TESTS 1 +#define TESTS 100000 + +btree_key record[TESTS]; + +void verify_existence (btree_key, int); + +int main (int argc, char *argv[]) +{ + gfile_t *vp; + btree_key key; + TREE_TYPE datum; + int rc; + int i; + int unique = 0; + long seed; + + seed = time (0); + printf ("Using seed %d\n", (int) seed); + srand (seed); + + rc = btree_create_btree (argv[1], &vp); + + for (i = 0; i < TESTS; ++i) + { + key = rand () % (10 * TESTS); + //key = 0xffffff0000000001; + key = key << 32; + printf("key %lu\n", key); + record[i] = key; + rc = btree_insert (vp, key, key); + if (rc == 0) + ++unique; + else + verify_existence (key, i); + } + + for (i = 0; i < TESTS; ++i) + { + rc = btree_lookup (vp, record[i], &datum); + assert (rc == 0); + assert (record[i] == datum); + } + + printf ("Inserted %d unique items.\n", unique); + + return 0; +} + +void verify_existence (btree_key key, int from) +{ + int i; + + for (i = from - 1; i >= 0; --i) + if (record[i] == key) + return; + + assert (0); + + return; +} diff --git a/libnetmap/nmport.c b/libnetmap/nmport.c index c9c34059e..fa858d163 100644 --- a/libnetmap/nmport.c +++ b/libnetmap/nmport.c @@ -522,6 +522,7 @@ nmport_enable_option(const char *opt) struct nmreq_opt_parser *p; for (p = nmport_opt_parsers; p != NULL; p = p->next) { + if (!strcmp(p->prefix, opt)) { p->flags &= ~NMREQ_OPTF_DISABLED; return 0; diff --git a/libnetmap/nmreq.c b/libnetmap/nmreq.c index 8df02aefa..d7a65e2b7 100644 --- a/libnetmap/nmreq.c +++ b/libnetmap/nmreq.c @@ -88,6 +88,7 @@ struct nmreq_prefix { static struct nmreq_prefix nmreq_prefixes[] = { declprefix("netmap", NR_P_SKIP), declprefix(NM_BDG_NAME, NR_P_ID|NR_P_EMPTYID), + declprefix(NM_PST_NAME, NR_P_ID|NR_P_EMPTYID), { NULL } /* terminate the list */ }; @@ -346,6 +347,9 @@ nmreq_register_decode(const char **pifname, struct nmreq_register *r, struct nmc case 'T': nr_flags |= NR_TX_RINGS_ONLY; break; + case 'V': + nr_flags |= NR_ACCEPT_VNET_HDR; + break; default: nmctx_ferror(ctx, "unrecognized flag: '%c'", *scan); goto fail; diff --git a/share/man/man4/paste.4 b/share/man/man4/paste.4 new file mode 100644 index 000000000..5da7ccb93 --- /dev/null +++ b/share/man/man4/paste.4 @@ -0,0 +1,303 @@ +.\" Copyright (c) 2021 Michio Honda +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" This document is derived in part from the enet man page (enet.4)
+.\" distributed with 4.3BSD Unix.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd April 2, 2021
+.Dt PASTE 4
+.Os
+.Sh NAME
+.Nm paste
+.Nd fast TCP/IP networking using the netmap API
+.Sh SYNOPSIS
+.Cd device netmap
+.Sh DESCRIPTION
+.Nm
+is a feature of
+.Xr netmap 4
+that allows applications to use the host TCP/IP stack over the netmap API.
+The resulting batching of system calls and I/O across multiple connections,
+combined with zero copy, enables high throughput and connection scalability.
+.Nm paste
+also supports zero-copy data transfer to or from persistent memory (PM) with
+the
+.Nm extmem
+feature.
+.Ss NAMING
+.Nm
+ports are named
+.Pa pstSSS:PPP
+where
+.Pa pst
+indicates the
+.Nm
+port type,
+.Pa SSS
+indicates a data path, and
+.Pa PPP
+names a port.
+The same prefix, separated by the colon,
+is used to associate a NIC port with the same data path.
+.Ss SEMANTICS
+Unlike regular port types, a
+.Nm
+port transmits and receives application-level data.
+Every slot thus contains a file descriptor and a data offset at which
+the transport protocol header terminates; this offset is in addition to
+the generic offset obtained by NETMAP_ROFFSET().
+On RX the kernel sets these slot attributes, whereas the application does so
+for TX slots.
+A TX or RX ring may contain slots whose buffers belong to different file
+descriptors.
+
+On RX, the application sees an in-order TCP stream across the multiple
+slots indicated by the same file descriptor; the kernel never places the
+buffers of out-of-order segments in an RX ring of the
+.Nm
+port.
+
+To read or write the file descriptor and data offset in a slot, use the
+utility functions described later.
+
+A typical application would use a 66-byte data offset, which covers the
+Ethernet and IPv4 headers plus a TCP header carrying the 12-byte timestamp
+option that is usually attached to every TCP segment.
+If the headroom does not match the length of the actual protocol
+headers, the kernel shifts the application data in the buffer at the expense
+of data copy overhead.
+
+A
+.Nm
+port does not survive the process lifetime, and the associated NIC port is
+also released, unless one or more registered sockets are still alive.
+.\"
+.Ss CONTROL
+.Nm
+introduces two
+.Xr ioctl 2
+subcommands to
+.Xr netmap 4
+ports, indicated in the
+.Pa nr_reqtype
+field of
+.Pa struct nmreq_header .
+.
+.Bl -tag -width XXX
+.It Dv NETMAP_REQ_PST_ATTACH
+attaches a NIC to the port.
+.Pa nr_body
+points to
+.Pa struct nmreq_vale_attach :
+.Bd -literal
+struct nmreq_vale_attach {
+	struct nmreq_register reg;
+	uint32_t port_index;
+	uint32_t pad1;
+};
+.Ed
+.Pp
+.Pa nr_mem_id
+in
+.Pa reg
+must be identical to that of the
+.Nm
+port, which can be found in the
+.Pa struct nmport_d
+structure filled by
+.Pa nmport_open_desc() .
+.
+.It Dv NETMAP_REQ_PST_FD_REG
+associates a file descriptor with a
+.Nm
+port.
+.Pa nr_body
+points to
+.Pa struct nmreq_pst_fd_reg :
+.Bd -literal
+struct nmreq_pst_fd_reg {
+	int32_t fd;
+	int32_t pad;
+};
+.Ed
+.Pp
+.Pa fd
+is an accepted TCP socket; a
+.Xr listen 2
+socket must not be registered.
+Although currently only TCP is supported, UDP support is planned.
+
+.El
+.\"
+See
+.Xr netmap 4
+for the general
+.Pa struct nmreq_header
+format.
+.\"
+.Ss UTILITY FUNCTIONS
+Four macros are available to read or set the file descriptor and the data
+offset in a slot.
+.\"
+.Bl -ohang
+.It Ft int Fn nm_pst_getfd slot
+Get the file descriptor embedded in
+.Fa slot .
+.It Ft void Fn nm_pst_setfd slot fd
+Set the file descriptor
+.Fa fd
+in the
+.Fa slot .
+.It Ft int Fn nm_pst_getdoff slot
+Get the additional offset embedded in
+.Fa slot .
+.It Ft void Fn nm_pst_setdoff slot doff
+Set the additional offset
+.Fa doff
+in
+.Fa slot .
+.El
+.\"
+.Sh EXAMPLES
+.Ss TEST PROGRAM
+.Pa phttpd
+is an HTTP server that supports
+.Nm .
+General HTTP benchmark tools, such as
+.Nm wrk ,
+can be used as the client.
+.\"
+.Ss BASIC USAGE
+Typical TCP server code looks like the following.
+.Pp
+.Bd -literal -compact
+
+ ...
+#include
+ ...
+
+const u_int DOFF = 66; /* Ether/IP/TCP headers + timestamp option */
+
+void write_txring(char *p, int len, int fd, struct netmap_ring *ring)
+{
+	u_int cur = ring->cur;
+	struct netmap_slot *slot = &ring->slot[cur];
+	char *buf = NETMAP_BUF_OFFSET(ring, slot->buf_idx);
+
+	memcpy(buf + DOFF, p, len);
+	slot->len = len + DOFF;
+	nm_pst_setdoff(slot, DOFF);
+	nm_pst_setfd(slot, fd);
+	ring->head = ring->cur = nm_ring_next(ring, cur);
+}
+
+void tcp_server(void)
+{
+	char *name = "pst:0";
+	char *nic_name = "pst:em0";
+	struct nmport_d *nmd;
+	struct nmreq_header hdr;
+	struct nmreq_vale_attach reg; /* reused for PST_ATTACH */
+	struct pollfd fds[2];
+	struct sockaddr client;
+	socklen_t clen = sizeof(client);
+	int sd, newfd;
+
+	/* open and listen on a socket */
+	sd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	bind(sd, ...);
+	listen(sd, 0);
+
+	/* open a paste port */
+	nmport_enable_option("offset");
+	nmd = nmport_prepare(name);
+	nmport_open_desc(nmd);
+
+	/* attach a NIC */
+	bzero(&hdr, sizeof(hdr));
+	hdr.nr_version = NETMAP_API;
+	hdr.nr_reqtype = NETMAP_REQ_PST_ATTACH;
+	strncpy(hdr.nr_name, nic_name, sizeof(hdr.nr_name) - 1);
+	hdr.nr_body = (uintptr_t)&reg;
+	bzero(&reg, sizeof(reg));
+	reg.reg.nr_mem_id = nmd->reg.nr_mem_id;
+	reg.reg.nr_mode = NR_REG_NIC_SW;
+	ioctl(nmd->fd, NIOCCTRL, &hdr);
+
+	fds[0].fd = nmd->fd;
+	fds[0].events = POLLIN;
+	fds[1].fd = sd;
+	fds[1].events = POLLIN;
+	for (;;) {
+		/* monitor the netmap and listen descriptors */
+		poll(fds, 2, 2000 /* use 0 for busy polling */);
+		if (fds[1].revents & POLLIN) { /* accept and register new fd */
+			struct nmreq_header hdr2;
+			struct nmreq_pst_fd_reg fdr;
+
+			newfd = accept(fds[1].fd, &client, &clen);
+
+			hdr2 = hdr; /* reuse the port name */
+			hdr2.nr_reqtype = NETMAP_REQ_PST_FD_REG;
+			fdr.fd = newfd;
+			hdr2.nr_body = (uintptr_t)&fdr;
+			ioctl(nmd->fd, NIOCCTRL, &hdr2);
+		}
+		if (fds[0].revents & POLLIN) {
+			struct netmap_if *nifp = nmd->nifp;
+			struct netmap_ring *rxr = NETMAP_RXRING(nifp, 0);
+			struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);
+
+			while (!nm_ring_empty(rxr)) {
+				int i = rxr->cur;
+				struct netmap_slot *slot = &rxr->slot[i];
+				char *buf = NETMAP_BUF_OFFSET(rxr, slot->buf_idx);
+
+				buf += nm_pst_getdoff(slot);
+				... consume data on buf ...
+				... we have also prepared something to transmit on buf ...
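+				/*
+				 * Echo it back on the same connection: the
+				 * file descriptor carried in the RX slot
+				 * tells write_txring() which socket this
+				 * data belongs to.
+				 */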
+				write_txring(buf, len, nm_pst_getfd(slot), txr);
+				rxr->head = rxr->cur = nm_ring_next(rxr, i);
+			}
+		}
+	}
+}
+.Ed
+.Pp
+Note that, as with other netmap ports, checksum and segmentation offloading
+features must currently be disabled on FreeBSD.
+On Linux, by contrast, checksum offload must be enabled via ethtool,
+although it does not actually take effect unless the patch described in
+netmap_paste.c is applied.
+.\"
+.Sh SEE ALSO
+.Xr vale 4 ,
+.Xr netmap 4
+.Pp
+Michio Honda, Giuseppe Lettieri, Lars Eggert, and Douglas Santry,
+PASTE: A Network Programming Interface for Non-Volatile
+Main Memory, USENIX NSDI 2018
+.Pp
+.\"
+.Sh AUTHOR
+.An -nosplit
+.An Michio Honda .
+.\"
diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c
index bea136c76..9da0ad018 100644
--- a/sys/dev/netmap/netmap.c
+++ b/sys/dev/netmap/netmap.c
@@ -1620,6 +1620,14 @@ netmap_get_na(struct nmreq_header *hdr,
 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
 		goto out;
 
+	/* try to see if this is a stack port */
+	error = netmap_get_pst_na(hdr, na, nmd, create);
+	if (error)
+		goto out;
+
+	if (*na != NULL) /* valid match (same as vale) */
+		goto out;
+
 	/*
 	 * This must be a hardware na, lookup the name in the system.
 	 * Note that by hardware we actually mean "it shows up in ifconfig".
@@ -2882,7 +2890,7 @@ netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
 
 		if (req->nr_extra_bufs) {
 			if (netmap_verbose)
-				nm_prinf("requested %d extra buffers",
+				nm_prinf("requested %u extra buffers",
 					req->nr_extra_bufs);
 			req->nr_extra_bufs = netmap_extra_alloc(na,
 				&nifp->ni_bufs_head, req->nr_extra_bufs);
@@ -3152,6 +3160,31 @@
 		error = netmap_sync_kloop_stop(priv);
 		break;
 	}
+#ifdef WITH_PASTE
+	case NETMAP_REQ_PST_ATTACH: {
+		error = netmap_bdg_attach(hdr, NULL /* userspace request */);
+		break;
+	}
+
+	case NETMAP_REQ_PST_DETACH: {
+		error = netmap_bdg_detach(hdr, NULL /* userspace request */);
+		break;
+	}
+	case NETMAP_REQ_PST_FD_REG: {
+		struct nmreq_pst_fd_reg *fdr =
+			(struct nmreq_pst_fd_reg *)(uintptr_t)hdr->nr_body;
+		NMG_LOCK();
+		error = netmap_get_pst_na(hdr, &na, NULL, 0);
+		NMG_UNLOCK();
+		if (!error && na) {
+			error = netmap_pst_register_fd(na, fdr->fd);
+		}
+		if (na) {
+			netmap_adapter_put(na);
+		}
+		break;
+	}
+#endif /* WITH_PASTE */
 
 	default: {
 		error = EINVAL;
@@ -3272,6 +3305,12 @@ nmreq_size_by_type(uint16_t nr_reqtype)
 		return sizeof(struct nmreq_pools_info);
 	case NETMAP_REQ_SYNC_KLOOP_START:
 		return sizeof(struct nmreq_sync_kloop_start);
+	case NETMAP_REQ_PST_ATTACH:
+		return sizeof(struct nmreq_vale_attach);
+	case NETMAP_REQ_PST_DETACH:
+		return sizeof(struct nmreq_vale_detach);
+	case NETMAP_REQ_PST_FD_REG:
+		return sizeof(struct nmreq_pst_fd_reg);
 	}
 	return 0;
 }
diff --git a/sys/dev/netmap/netmap_bdg.c b/sys/dev/netmap/netmap_bdg.c
index 729aee7f6..dc666deb2 100644
--- a/sys/dev/netmap/netmap_bdg.c
+++ b/sys/dev/netmap/netmap_bdg.c
@@ -119,6 +119,11 @@ netmap_bdg_name(struct netmap_vp_adapter *vp)
 	return b->bdg_basename;
 }
 
+static int
+nm_bdg_reqtype_attach(uint16_t type)
+{
+	return type == NETMAP_REQ_VALE_ATTACH || type == NETMAP_REQ_PST_ATTACH;
+}
 
 #ifndef CONFIG_NET_NS
 /*
@@ -354,7 +359,7 @@ netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
 	struct nm_bridge *b = vpna->na_bdg;
 
-	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
+	if (nm_bdg_reqtype_attach(hdr->nr_reqtype)) {
 		return 0; /* nothing to do */
 	}
 	if (b) {
@@ -484,6 +489,8 @@
netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na, case NETMAP_REQ_VALE_DETACH: case NETMAP_REQ_VALE_POLLING_ENABLE: case NETMAP_REQ_VALE_POLLING_DISABLE: + case NETMAP_REQ_PST_ATTACH: + case NETMAP_REQ_PST_DETACH: break; /* ok */ default: error = EINVAL; @@ -503,7 +510,7 @@ netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na, goto out; vpna = hw->na_vp; hostna = hw->na_hostvp; - if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) { + if (nm_bdg_reqtype_attach(hdr->nr_reqtype)) { /* Check if we need to skip the host rings. */ struct nmreq_vale_attach *areq = (struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body; @@ -580,11 +587,27 @@ netmap_bdg_attach(struct nmreq_header *hdr, void *auth_token) if (error) { /* no device */ goto unlock_exit; } + if (na) { + goto found; + } - if (na == NULL) { /* VALE prefix missing */ + /* check for existing one */ + error = netmap_get_pst_na(hdr, &na, nmd, 0); + if (na) { + error = EBUSY; + goto unref_exit; + } + error = netmap_get_pst_na(hdr, &na, + nmd, 1 /* create if not exists */); + if (error) { /* no device */ + goto unlock_exit; + } + + if (na == NULL) { /* any prefix missing */ error = EINVAL; goto unlock_exit; } +found: if (NETMAP_OWNED_BY_ANY(na)) { error = EBUSY; @@ -658,7 +681,14 @@ netmap_bdg_detach_locked(struct nmreq_header *hdr, void *auth_token) error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */); if (error) { /* no device, or another bridge or user owns the device */ goto error_exit; + } else if (na != NULL) { + goto found; } + error = netmap_get_pst_na(hdr, &na, NULL, 0 /* don't create */); + if (error) { /* no device, or another bridge or user owns the device */ + goto error_exit; + } +found: if (na == NULL) { /* VALE prefix missing */ error = EINVAL; @@ -1094,7 +1124,7 @@ netmap_vp_reg(struct netmap_adapter *na, int onoff) /* rxsync code used by VALE ports nm_rxsync callback and also * internally by the brwap */ -static int +int netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; @@ -1667,7 +1697,7 @@ netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na) struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; int error = 0; - if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) { + if (nm_bdg_reqtype_attach(hdr->nr_reqtype)) { struct nmreq_vale_attach *req = (struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body; if (req->reg.nr_ringid != 0 || diff --git a/sys/dev/netmap/netmap_bdg.h b/sys/dev/netmap/netmap_bdg.h index a88eaf11b..2b7595410 100644 --- a/sys/dev/netmap/netmap_bdg.h +++ b/sys/dev/netmap/netmap_bdg.h @@ -181,6 +181,7 @@ int netmap_bwrap_reg(struct netmap_adapter *, int onoff); int netmap_bdg_detach_locked(struct nmreq_header *hdr, void *auth_token); int netmap_vp_reg(struct netmap_adapter *na, int onoff); int netmap_vp_rxsync(struct netmap_kring *kring, int flags); +int netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags); int netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags); int netmap_bwrap_notify(struct netmap_kring *kring, int flags); int netmap_bwrap_attach_common(struct netmap_adapter *na, diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index de3fbaaff..566804761 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -39,6 +39,7 @@ #include /* DEV_MODULE_ORDERED */ #include #include /* kern_ioctl() */ +#include /* cap_rights_t */ #include @@ -53,26 +54,35 @@ #include #include /* sockaddrs */ +#include /* 
getsock_cap() */ #include #include /* kthread_add() */ #include /* PROC_LOCK() */ #include /* RFNOWAIT */ #include /* sched_bind() */ #include /* mp_maxid */ +#include /* struct uio */ #include /* taskqueue_enqueue(), taskqueue_create(), ... */ #include #include #include /* IFT_ETHER */ #include /* ether_ifdetach */ #include /* LLADDR */ +#include /* netisr_dispatch() */ #include /* bus_dmamap_* */ #include /* in6_cksum_pseudo() */ #include /* in_pseudo(), in_cksum_hdr() */ +#include /* ip_input() */ +#include /* TCP_NODELAY */ +/* just for debugging */ #include #include #include #include +#ifdef WITH_PASTE +#include +#endif /* WITH_PASTE */ /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ @@ -973,6 +983,360 @@ ptn_memdev_shutdown(device_t dev) #endif /* WITH_PTNETMAP */ +#ifdef WITH_PASTE +#include + +/* sockbuf is locked */ +int +nm_os_pst_upcall(NM_SOCK_T *so, void *x, int y) +{ + struct mbuf *m, *n; + struct sockbuf *sb = &so->so_rcv; + struct nmcb *cb; + struct netmap_kring *kring = NULL; + int flags = MSG_DONTWAIT | MSG_EOR; + + if (unlikely(!sbavail(sb))) { + struct pst_so_adapter *soa = pst_so(so); + + /* XXX We need trick to set zero-length buffer */ + if (likely(soa)) { + struct netmap_pst_adapter *pna = + (struct netmap_pst_adapter *)soa->na; + pna->eventso[curcpu] = so; + } + return 0; + } + for (m = sb->sb_mb; m != NULL; m = n) { + struct netmap_slot *slot; +#ifdef PST_MB_RECYCLE + int queued = 0; +#endif + + sbfree(sb, m); + n = m->m_next; + + cb = NMCB(m); + if (unlikely(!nmcb_valid(cb))) { + PST_DBG("invalid cb %p", cb); + goto skip_mfree; + } + kring = nmcb_kring(cb); + if (unlikely(kring == NULL)) { + PST_DBG("no kring cb %p", cb); + goto skip_mfree; + } + slot = nmcb_slot(cb); + if (unlikely(slot == NULL)) { + PST_DBG("no slot"); + goto skip_mfree; + } + nm_pst_setfd(slot, pst_so(so)->fd); + nm_pst_setdoff(slot, + m->m_data - M_START(m) - nm_get_offset(kring, slot)); + pst_fdt_add(cb, kring); +#ifdef PST_MB_RECYCLE + if (unlikely(nmcb_rstate(cb) == MB_QUEUED)) { + queued = 1; + } +#endif /* PST_MB_RECYCLE */ + nmcb_wstate(cb, MB_FTREF); +#ifdef PST_MB_RECYCLE + if (likely(!queued)) { + nm_os_pst_mbuf_data_dtor(m); + kring->tx_pool[1] = m; + continue; + } +#endif /* PST_MB_RECYCLE */ +skip_mfree: + m_free(m); + } + sb->sb_mb = NULL; + sb->sb_lastrecord = NULL; + SB_EMPTY_FIXUP(sb); + /* taken from soreceive_stream() */ + if (paste_usrrcv && (so->so_proto->pr_flags & PR_WANTRCVD) && + ((flags & MSG_WAITALL) || !(flags & MSG_SOCALLBCK))) { + PST_DBG("WAITALL %d SOCALLBCK %d", + flags & MSG_WAITALL, flags & MSG_SOCALLBCK); + SOCKBUF_UNLOCK(sb); + (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); + SOCKBUF_LOCK(sb); + } + return 0; +} + +NM_SOCK_T * +nm_os_sock_fget(int fd, void **f) +{ + int err; + cap_rights_t rights; + struct file *fp; + u_int fflag; + + err = getsock_cap(curthread, fd, cap_rights_init(&rights, CAP_IOCTL), + &fp, &fflag, NULL); + *f = fp; + return err ? 
NULL : fp->f_data; +} + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) +void +nm_os_sock_fput(NM_SOCK_T *so, void *f) +{ + fdrop((struct file *)f, curthread); +} + +int +nm_os_pst_sbdrain(struct netmap_adapter *na, NM_SOCK_T *so) +{ + struct mbuf * m; + struct nmcb *cb; + int error = 0; + + SOCKBUF_LOCK(&so->so_rcv); + if (!sbavail(&so->so_rcv)) { + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + error = ENOTCONN; + } + SOCKBUF_UNLOCK(&so->so_rcv); + return error; + } + m = so->so_rcv.sb_mb; + cb = NMCB_EXT(m, 0, NETMAP_BUF_SIZE(na)); + if (!nmcb_valid(cb)) { + PST_DBG("invalid cb"); + SOCKBUF_UNLOCK(&so->so_rcv); + return error; + } + error = nm_os_pst_upcall(so, NULL, 0); + SOCKBUF_UNLOCK(&so->so_rcv); + return error; +} + +int +nm_os_pst_mbuf_extadj(struct mbuf *m, int i, int off) +{ + struct mbuf *n; + int cnt = 0; + + for (n = m; n; n = n->m_next) { + if (n->m_flags & M_PKTHDR) + continue; + if (cnt++ > i) + break; + n->m_data += off; + return 0; + } + return -1; +} + +void +nm_os_pst_mbuf_data_dtor(struct mbuf *m) +{ + return pst_mbuf_data_dtor(NMCB(m)); +} + +int +nm_os_sock_dead(NM_SOCK_T *so) +{ + return 0; +} + +#ifdef PST_MB_RECYCLE +static inline int +nm_os_mbuf_valid(struct mbuf *m) +{ + return (likely(m->m_flags & M_EXT) && + likely(m->m_ext.ext_flags & EXT_FLAG_EMBREF)); +} +#endif /* PST_MB_RECYCLE */ + +static struct mbuf * +maybe_new_mbuf(struct netmap_kring *kring) +{ + struct netmap_adapter *na = kring->na; + struct mbuf *m; + +#ifdef PST_MB_RECYCLE + m = kring->tx_pool[1]; + if (m) { + kring->tx_pool[1] = NULL; + *m = *kring->tx_pool[0]; + } else +#endif + m = nm_os_get_mbuf(na->ifp, NETMAP_BUF_SIZE(na)); + if (unlikely(!m)) + return NULL; +#ifdef PST_MB_RECYCLE + else if (unlikely(!nm_os_mbuf_valid(kring->tx_pool[0]))) { + *kring->tx_pool[0] = *m; + } +#endif /* PST_MB_RECYCLE */ + return m; +} + +int +nm_os_pst_rx(struct netmap_kring *kring, struct netmap_slot *slot) +{ + struct netmap_adapter *na = kring->na; + struct ifnet *ifp = na->ifp; + char *nmb = NMB(na, slot); + struct nmcb *cb = NMCB_BUF(nmb); + struct mbuf *m; + int ret = 0; + struct epoch_tracker et; + struct netmap_pst_adapter *pna = + (struct netmap_pst_adapter *)pst_na(na); + pna->eventso[curcpu] = NULL; + + m = maybe_new_mbuf(kring); + if (unlikely(m == NULL)) { + return 0; // drop and skip + } + m->m_ext.ext_buf = nmb; + m->m_ext.ext_size = NETMAP_BUF_SIZE(na); + m->m_ext.ext_free = nm_os_pst_mbuf_data_dtor; + m->m_ext.ext_arg2 = NULL; + m->m_len = m->m_pkthdr.len = slot->len; + m->m_pkthdr.flowid = kring->ring_id; + m->m_pkthdr.rcvif = ifp; + m->m_data = nmb + nm_get_offset(kring, slot); + nmcbw(cb, kring, slot); + nmcb_wstate(cb, MB_STACK); + nm_pst_setfd(slot, 0); + + pst_get_extra_ref(kring); + + if (ntohs(*(uint16_t *)((char *)m->m_data + 12)) == ETHERTYPE_IP) { + CURVNET_SET_QUIET(ifp->if_vnet); + M_SETFIB(m, ifp->if_fib); + m_clrprotoflags(m); + m_adj(m, ETHER_HDR_LEN); + //netisr_dispatch(NETISR_IP, m); + NET_EPOCH_ENTER(et); + ip_input(m); + NET_EPOCH_EXIT(et); + CURVNET_RESTORE(); + } else { + NET_EPOCH_ENTER(et); + na->if_input(ifp, m); + NET_EPOCH_EXIT(et); + } + + /* + * The buffer might have triggered the socket upcall without + * passing the mbuf. 
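+	 * In that case nm_os_pst_upcall() recorded the socket in
+	 * pna->eventso[curcpu]; if no data upcall has claimed this slot
+	 * (its fd is still 0), deliver a zero-length slot for that socket
+	 * so that the application still observes the event.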
+ */ + if (unlikely(pna->eventso[curcpu] != NULL)) { + NM_SOCK_T *so = pna->eventso[curcpu]; + struct pst_so_adapter *soa = pst_so(so); + + pna->eventso[curcpu] = NULL; + /* ignore additional empty upcall */ + if (nm_pst_getfd(slot) == 0) { + /* NULL soa when the soupcall() context closed it */ + if (soa != NULL && nmcb_rstate(cb) == MB_NOREF) { + nm_pst_setfd(slot, soa->fd); + nm_pst_setdoff(slot, 0); + pst_fdt_add(cb, kring); + } + } + } + + if (unlikely(nmcb_rstate(cb) == MB_STACK)) { + nmcb_wstate(cb, MB_QUEUED); + if (unlikely(pst_extra_enq(kring, slot))) { + ret = -EBUSY; + } + } + return 0; +} + +int +nm_os_pst_tx(struct netmap_kring *kring, struct netmap_slot *slot) +{ + struct netmap_adapter *na = kring->na; + struct pst_so_adapter *soa; + struct mbuf *m; + char *nmb = NMB(na, slot); + int err; + struct nmcb *cb = NMCB_BUF(nmb); + const int flags = MSG_DONTWAIT | MSG_DONTROUTE; + const u_int pst_offset = nm_pst_getdoff(slot); + const u_int nm_offset = nm_get_offset(kring, slot); + + soa = pst_soa_from_fd(na, nm_pst_getfd(slot)); + if (unlikely(!soa)) { + PST_DBG("no soa (fd %d na %s)", nm_pst_getfd(slot), na->name); + return 0; + } + + /* Link to the external mbuf storage */ + m = nm_os_get_mbuf(na->ifp, NETMAP_BUF_SIZE(na)); + if (unlikely(m == NULL)) { + return 0; // XXX + } + m->m_ext.ext_buf = m->m_data = nmb; + /* + * If we give the entire netmap buffer size, subsequent data can be + * copied to the (unused) trailing space, causing pst_transmit() + * to see the same cb multiple times and only the first one + * with MB_STACK is zero-copied. + */ + m->m_ext.ext_size = slot->len; + m->m_ext.ext_free = nm_os_pst_mbuf_data_dtor; + m->m_len = slot->len - pst_offset; + m->m_data = nmb + nm_offset + pst_offset; + + nmcb_wstate(cb, MB_STACK); + + pst_get_extra_ref(nmcb_kring(cb)); + err = sosend(soa->so, NULL, NULL, m, NULL, flags, curthread); + if (unlikely(err != 0)) { + PST_DBG_LIM("sosend error %d", err); + return -err; + } + + if (unlikely(nmcb_rstate(cb) == MB_STACK)) { + nmcb_wstate(cb, MB_QUEUED); + if (likely(pst_extra_enq(kring, slot))) { + return -EBUSY; + } + } + return 0; +} + +int +nm_os_set_nodelay(NM_SOCK_T *so) +{ + int on = 1; + + return so_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); +} + +int +nm_os_kthread_add(void *f, void *arg, void *proc, struct thread **tdptr, + int flags, int pages, const char *fmt) +{ + return kthread_add(f, arg, proc, tdptr, flags, pages, fmt, ""); +} + +int +nm_os_hwcsum_ok(struct netmap_adapter *na) +{ + return !(if_getcapenable(na->ifp) & IFCAP_HWCSUM); +} + +int +nm_os_so_connected(NM_SOCK_T *so) +{ + return so->so_state & SS_ISCONNECTED; +} +#endif /* WITH_PASTE */ + /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c index 87810f7c7..ec86c4dd7 100644 --- a/sys/dev/netmap/netmap_generic.c +++ b/sys/dev/netmap/netmap_generic.c @@ -989,7 +989,7 @@ generic_netmap_rxsync(struct netmap_kring *kring, int flags) do { struct netmap_slot *slot = ring->slot + nm_i; uint64_t nm_offset = nm_get_offset(kring, slot); - void *nmaddr = NMB(na, slot); + char *nmaddr = NMB(na, slot); if (nmaddr == NETMAP_BUF_BASE(na)) { /* Bad buffer */ m_freem(m); diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 5d8957241..9092e25db 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -60,6 +60,9 @@ #if defined(CONFIG_NETMAP_SINK) #define 
WITH_SINK #endif +#if defined(CONFIG_NETMAP_PASTE) +#define WITH_PASTE +#endif #if defined(CONFIG_NETMAP_NULL) #define WITH_NMNULL #endif @@ -77,6 +80,7 @@ #define WITH_MONITOR #define WITH_GENERIC #define WITH_EXTMEM +#define WITH_PASTE #define WITH_NMNULL #endif @@ -292,7 +296,6 @@ struct netmap_adapter; struct nm_bdg_fwd; struct nm_bridge; struct netmap_priv_d; -struct nm_bdg_args; /* os-specific NM_SELINFO_T initialization/destruction functions */ int nm_os_selinfo_init(NM_SELINFO_T *, const char *name); @@ -595,6 +598,11 @@ struct netmap_kring { int (*mon_notify)(struct netmap_kring *kring, int flags); #endif + +#ifdef WITH_PASTE + struct pst_extra_pool *extra; +#endif /* WITH_PASTE */ + } #ifdef _WIN32 __declspec(align(64)); @@ -1190,6 +1198,208 @@ struct netmap_pipe_adapter { #endif /* WITH_PIPES */ +#ifdef WITH_PASTE +#define PST_DBG(format, ...) \ + do { \ + if (netmap_debug & NM_DEBUG_PST) { \ + nm_prinf(format, ##__VA_ARGS__); \ + } \ + } while (0) +#define PST_DBG_LIM(format, ...) \ + do { \ + if (netmap_debug & NM_DEBUG_PST) { \ + nm_prlim(1, format, ##__VA_ARGS__); \ + } \ + } while (0) + +#ifdef __FreeBSD__ +#define MBUF_L3_OFST(m) (m)->m_pkthdr.l2hlen +#define MBUF_L4_OFST(m) (MBUF_L3_OFST(m) + (m)->m_pkthdr.l3hlen) +#define MBUF_L3_HEADER(m) mtodo((m), MBUF_L3_OFST(m)) +#define MBUF_L4_HEADER(m) mtodo((m), MBUF_L4_OFST(m)) +#define MBUF_HASNEXT(m) (m->m_next != NULL) +#define MBUF_FLATTEN(m) // XXX +#define MBUF_DATA(m) (m)->m_data +#define MBUF_HDRLEN(m) ((m)->m_flags & M_PKTHDR ? (m)->m_len : 0) +#define MBUF_CSUM_DONE(m) + +#define NM_SOCK_T struct socket +#define SAVE_SOUPCALL(so, soa) +#define RESTORE_SOUPCALL(so, soa) soupcall_clear(so, SO_RCV) +#define SAVE_SODTOR(so, soa) (soa)->save_sodtor = (so)->so_dtor +#define RESTORE_SODTOR(so, soa) (so)->so_dtor = (soa)->save_sodtor /* no lock */ +#define SET_SOUPCALL(so, f) soupcall_set(so, SO_RCV, f, NULL) +#define SET_SODTOR(so, f) sodtor_set(so, f) + +struct nm_ubuf_info { + void *ctx; + void *desc; +}; + +#define nm_os_sock_set_nocoalesce(_sb) (_sb)->sb_flags |= SB_NOCOALESCE +int nm_os_pst_upcall(NM_SOCK_T *, void *, int); +void nm_os_pst_mbuf_data_dtor(struct mbuf *); +#include /* struct socket */ +#define NMCB(_m) ((struct nmcb *)M_START(_m)) +#define NMCB_BUF(_buf) ((struct nmcb *)(_buf)) +#define NMCB_EXT(_m, _i, _bufsiz) \ + NMCB_BUF((_m)->m_ext.ext_buf) + +#define nmcb_kring(cb) ((struct netmap_kring *)(cb)->ui.ctx) +#define nmcb_slot(cb) ((struct netmap_slot *)(cb)->ui.desc) +#define nmcbw(cb, kring, slot) do {\ + (cb)->ui.ctx = (kring);\ + (cb)->ui.desc = (slot);\ +} while (0) +#else /* __FreeBSD__ */ +void nm_os_pst_upcall(NM_SOCK_T *); +netdev_tx_t linux_pst_start_xmit(struct mbuf *, struct ifnet *); +#if defined(NETMAP_LINUX_UBUF_INFO_CALLBACK_3ARGS) +void nm_os_pst_mbuf_data_dtor(struct sk_buff *, struct ubuf_info *, bool); +#else +void nm_os_pst_mbuf_data_dtor(struct ubuf_info *, bool); +#endif +void nm_os_set_mbuf_data_destructor(struct mbuf *, struct nm_ubuf_info *, void *); +#endif +#define NMCB_SLT(_na, _slt) NMCB_BUF(NMB(_na, (_slt))) + +#define PST_MB_RECYCLE +struct pst_extra_pool; + +/* to be embedded in the buf */ +/* struct skb_shared_info takes 320 byte so far. 
+ * Just for the case we would keep occupancy to 1600 Byte before this + * We have budget of 40 byte for each of msghdr and cb + * after 1520 data+headroom + */ +enum { + MB_STACK=1, + MB_QUEUED, + MB_TXREF, + MB_FTREF, + MB_NOREF, +}; + +struct nmcb { + struct nm_ubuf_info ui; /* ctx keeps kring and desc keeps slot */ +#define MB_MAGIC 0x12345600 /* XXX do better */ +#define MB_MAGIC_MASK 0xffffff00 /* XXX do better */ + uint32_t flags; + uint32_t next; + uint32_t cmd; + uint32_t off; +} __attribute__((__packed__)); /* 32 byte */ + +static inline void +nmcb_wstate(struct nmcb *cb, u_int newstate) +{ + cb->flags = (MB_MAGIC | newstate); +} + +static inline void +nmcb_invalidate(struct nmcb *cb) +{ + cb->flags = 0; +} + +static inline int +nmcb_valid(struct nmcb *cb) +{ + return ((cb->flags & MB_MAGIC_MASK) == MB_MAGIC); +} + +static inline int +nmcb_rstate(struct nmcb *cb) +{ + return likely(nmcb_valid(cb)) ? + (cb->flags & ~MB_MAGIC_MASK) : 0; +} + +struct pst_so_adapter { + NM_SOCK_T *so; + int32_t fd; + /* 32 bit hole */ + struct netmap_adapter *na; +#ifdef linux + void (*save_soupcall)(NM_SOCK_T *); + void (*save_sodtor)(NM_SOCK_T *); +#else + int (*save_soupcall)(NM_SOCK_T *, void *, int); + void (*save_sodtor)(NM_SOCK_T *); +#endif +}; + +struct netmap_pst_adapter { + struct netmap_vp_adapter up; + int (*save_reg)(struct netmap_adapter *na, int onoff); +#ifdef linux + struct net_device_ops stack_ndo; +#endif /* linux */ + struct pst_so_adapter **so_adapters; +#define DEFAULT_SK_ADAPTERS 65535 + u_int so_adapters_max; + u_int num_so_adapters; + NM_LOCK_T so_adapters_lock; +#ifdef __FreeBSD__ + void *eventso[128]; + struct thread *kwaittdp; +#else + struct thread *kwaittdp; +#endif + struct netmap_priv_d *kpriv; +}; + +#ifdef __FreeBSD__ +static inline struct pst_so_adapter * +pst_so(NM_SOCK_T *so) +{ + return (struct pst_so_adapter *)so->so_emuldata; +} + +static inline void +pst_wso(struct pst_so_adapter *soa, NM_SOCK_T *so) +{ + so->so_emuldata = (void *)soa; +} + +#else +/* bsd_glue.h for Linux */ +#endif /* __FreeBSD__ */ + +struct netmap_adapter *pst_na(const struct netmap_adapter *slave); +int netmap_pst_register_fd(struct netmap_adapter *na, int fd); +struct pst_so_adapter * pst_soa_from_fd(struct netmap_adapter *, int); +int pst_extra_enq(struct netmap_kring *, struct netmap_slot *); +void pst_extra_deq(struct netmap_kring *, struct netmap_slot *); +void pst_fdt_add(struct nmcb *, struct netmap_kring *); +int netmap_pst_transmit(struct ifnet *, struct mbuf *); +void pst_mbuf_data_dtor(struct nmcb *); +void pst_get_extra_ref(struct netmap_kring *); +void pst_put_extra_ref(struct netmap_kring *); +u_int pst_peek_extra_ref(struct netmap_kring *); +extern int paste_usrrcv; +extern int paste_optim_sendpage; +int netmap_get_pst_na(struct nmreq_header *hdr, struct netmap_adapter **na, + struct netmap_mem_d *nmd, int create); +/* + * non-static functions used by netmap_linux.c + */ +int nm_os_sock_dead(NM_SOCK_T *); +NM_SOCK_T *nm_os_sock_fget(int, void **); +void nm_os_sock_fput(NM_SOCK_T *, void *); +int nm_os_pst_sbdrain(struct netmap_adapter *, NM_SOCK_T *); +int nm_os_pst_mbuf_extadj(struct mbuf *, int, int); +int nm_os_pst_rx(struct netmap_kring *, struct netmap_slot *); +int nm_os_pst_tx(struct netmap_kring *, struct netmap_slot *); +int nm_os_set_nodelay(NM_SOCK_T *); +int nm_os_kthread_add(void *, void *, void *, struct thread **, int, int, const char *); +int nm_os_hwcsum_ok(struct netmap_adapter *); +int nm_os_so_connected(NM_SOCK_T *); + +#else /* !WITH_PASTE */ +#define 
netmap_get_pst_na(_1, _2, _3, _4) 0 +#endif /* !WITH_PASTE */ + #ifdef WITH_NMNULL struct netmap_null_adapter { struct netmap_adapter up; @@ -1681,6 +1891,7 @@ enum { /* debug flags */ NM_DEBUG_MEM = 0x4000, /* verbose memory allocations/deallocations */ NM_DEBUG_VALE = 0x8000, /* debug messages from memory allocators */ NM_DEBUG_BDG = NM_DEBUG_VALE, + NM_DEBUG_PST = 0x10000, }; extern int netmap_txsync_retry; diff --git a/sys/dev/netmap/netmap_paste.c b/sys/dev/netmap/netmap_paste.c new file mode 100644 index 000000000..6815ffc62 --- /dev/null +++ b/sys/dev/netmap/netmap_paste.c @@ -0,0 +1,1677 @@ +/* + * Copyright (C) 2018 Michio Honda + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#if defined(__FreeBSD__) +#include /* defines used in kernel.h */ +#include /* struct socket */ +#include /* sockaddrs */ +#include +#include +#include +#include /* struct ether_header */ +#include /* IPPROTO_UDP */ +#include /* bus_dmamap_* */ +#include /* kthread_add() */ + +#elif defined(linux) +#include "bsd_glue.h" +#define ENOTSUP ENOTSUPP +#else +#error Unsupported platform +#endif /* unsupported */ + +#include +#include +#include +#include + +#ifdef WITH_PASTE +#include + +int paste_host_batch = 1; +static int paste_extra = 2048; +int paste_usrrcv = 0; +int paste_optim_sendpage = 0; +SYSBEGIN(vars_paste); +SYSCTL_DECL(_dev_netmap); +SYSCTL_INT(_dev_netmap, OID_AUTO, paste_host_batch, CTLFLAG_RW, &paste_host_batch, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, paste_extra, CTLFLAG_RW, &paste_extra, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, paste_usrrcv, CTLFLAG_RW, &paste_usrrcv, 1 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, paste_optim_sendpage, CTLFLAG_RW, &paste_optim_sendpage, 1 , ""); +SYSEND; + +static int netmap_pst_bwrap_intr_notify(struct netmap_kring *kring, int flags); +static inline void +pst_swap(struct netmap_slot *s, struct netmap_slot *d) +{ + struct netmap_slot tmp = *d; + *d = *s; + *s = tmp; + s->flags |= NS_BUF_CHANGED; + d->flags |= NS_BUF_CHANGED; +} + +static inline u_int +rollup(struct netmap_kring *kring, u_int from, u_int to, u_int *n) +{ + u_int i, m = 0, lim = kring->nkr_num_slots - 1; + + for (i = from; i != to; m++) { + struct netmap_slot *slot = &kring->ring->slot[i]; + struct nmcb *cb = NMCB_SLT(kring->na, slot); + + i = nm_next(i, lim); + if (nmcb_valid(cb) && nmcb_rstate(cb) != MB_NOREF) { + i = nm_prev(i, lim); + break; + } + } + if (n) + *n = m; + return i; +} + +static inline struct netmap_pst_adapter * +topna(struct netmap_adapter *na) +{ + return (struct netmap_pst_adapter *)na; +} + +static inline struct netmap_vp_adapter * +tovpna(struct netmap_adapter *na) +{ + return (struct netmap_vp_adapter *)na; +} + +struct netmap_adapter * +pst_na(const struct netmap_adapter *port) +{ + const struct netmap_vp_adapter *vpna = + (const struct netmap_vp_adapter *)port; + return likely(vpna) ? 
&vpna->na_bdg->bdg_ports[0]->up : NULL; +} + +static inline int +is_host(struct netmap_adapter *na) +{ + return na->nm_register == NULL; +} + +#define for_bdg_ports(i, b) \ + for ((i) = 0; (i) < (b)->bdg_active_ports; (i)++) + +#define NM_PST_MAXRINGS 64 +#define NM_PST_RINGSIZE 1024 +#define NM_PST_MAXSLOTS 2048 +#define NM_PST_FD_MAX 65535 +#define NM_PST_BATCH_MAX 2048 + +/* Buffers sorted by file descriptors */ +struct pst_fdt_q { + uint32_t fq_head; + uint32_t fq_tail; +}; + +struct pst_fdt { + uint16_t nfds; + uint16_t npkts; + struct pst_fdt_q fde[NM_PST_FD_MAX]; + uint32_t tmp[NM_PST_BATCH_MAX]; + uint32_t fds[NM_PST_BATCH_MAX * 4]; +}; +#define NM_FDT_NULL 0 // invalid buf index + +struct pst_extra_slot { + struct netmap_slot slot; + uint16_t prev; + uint16_t next; +}; + +struct pst_extra_pool { + u_int num; + struct pst_extra_slot *slots; + uint32_t free; + uint32_t free_tail; + uint32_t busy; + uint32_t busy_tail; + u_int refcount; +}; + +void +pst_get_extra_ref(struct netmap_kring *kring) +{ + kring->extra->refcount++; +} + +void +pst_put_extra_ref(struct netmap_kring *kring) +{ + kring->extra->refcount--; +} + +u_int +pst_peek_extra_ref(struct netmap_kring *kring) +{ + return kring->extra->refcount; +} + +static int +pst_extra_noref(struct netmap_adapter *na) +{ + struct netmap_adapter *port; + struct nm_bridge *b = tovpna(na)->na_bdg; + int i, j; + + for_bdg_ports(i, b) { + port = &b->bdg_ports[i]->up; + if (is_host(port)) + continue; + for (j = 0; j < nma_get_nrings(port, NR_TX); j++) { + struct netmap_kring *kr = NMR(port, NR_TX)[j]; + + if (pst_peek_extra_ref(kr) > 0) { + struct pst_extra_pool *p; + u_int k, n = 0; + + p = kr->extra; + for (k = p->busy; k != p->busy_tail;) { + k = p->slots[k].next; + n++; + } + PST_DBG("%s ref %d busy slots %d", + kr->name, pst_peek_extra_ref(kr), n); + + return 0; + } + } + } + return 1; +} + +#define NM_EXT_NULL ((uint16_t)~0) +#define EXTRA_APPEND(name, pool, xtra, slots) \ + do { \ + u_int pos = xtra - slots; \ + xtra->next = NM_EXT_NULL; \ + if (pool->name == NM_EXT_NULL) \ + pool->name = pos; \ + else \ + slots[pool->name##_tail].next = pos; \ + xtra->prev = pool->name##_tail; \ + pool->name##_tail = pos; \ + } while (0) \ + +#define BETWEEN(x, l, h) \ + ((uintptr_t)(x) >= (uintptr_t)(l) && (uintptr_t)(x) < (uintptr_t)(l+h)) +void +pst_extra_deq(struct netmap_kring *kring, struct netmap_slot *slot) +{ + struct pst_extra_pool *pool; + struct pst_extra_slot *slots, *xtra; + + /* XXX raising mbuf might have been orphaned */ + if (netmap_debug) { + if (kring->nr_mode != NKR_NETMAP_ON) + panic("kring off"); + } + pool = kring->extra; + slots = pool->slots; + /* nothing to do if I am on the ring */ + if (unlikely(!(BETWEEN(slot, slots, pool->num)))) { + PST_DBG_LIM("%s kring %s buf_idx %u not in the extra pool", + kring->na->name, kring->name, slot->buf_idx); + return; + } + + xtra = (struct pst_extra_slot *)slot; + /* remove from busy list */ + if (xtra->next == NM_EXT_NULL) + pool->busy_tail = xtra->prev; // might be NM_EXT_NULL + else + slots[xtra->next].prev = xtra->prev; // might be NM_EXT_NULL + if (xtra->prev == NM_EXT_NULL) + pool->busy = xtra->next; // might be NM_EXT_NULL + else + slots[xtra->prev].next = xtra->next; // might be NM_EXT_NULL + /* append to free list */ + EXTRA_APPEND(free, pool, xtra, slots); +} +#undef BETWEEN + +int +pst_extra_enq(struct netmap_kring *kring, struct netmap_slot *slot) +{ + struct netmap_adapter *na = kring->na; + struct pst_extra_pool *pool = kring->extra; + struct pst_extra_slot *slots = 
pool->slots, *xtra; + struct nmcb *cb; + + if (unlikely(pool->free_tail == NM_EXT_NULL)) + return EBUSY; + + xtra = &slots[pool->free_tail]; + /* remove from free list */ + pool->free_tail = xtra->prev; + if (unlikely(pool->free_tail == NM_EXT_NULL)) // I'm the last one + pool->free = NM_EXT_NULL; + else + slots[xtra->prev].next = NM_EXT_NULL; + /* append to busy list */ + EXTRA_APPEND(busy, pool, xtra, slots); + + cb = NMCB_SLT(na, slot); + pst_swap(slot, &xtra->slot); + if (netmap_debug) { + if (nmcb_kring(cb) != kring) + panic(" "); + } + nmcbw(cb, nmcb_kring(cb), &xtra->slot); + return 0; +} +#undef EXTRA_APPEND + +static inline struct pst_fdt * +pst_get_fdt(struct netmap_kring *kring) +{ + return (struct pst_fdt *)kring->nkr_ft; +} + +static void +pst_fdt_free(struct netmap_adapter *na) +{ + int i; + + for (i = 0; i < netmap_real_rings(na, NR_TX); i++) { + struct netmap_kring *kring = NMR(na, NR_TX)[i]; + if (kring->nkr_ft) { + nm_os_free(kring->nkr_ft); + kring->nkr_ft = NULL; + } + } +} + +static int +pst_fdt_alloc(struct netmap_adapter *na) +{ + int i; + + for (i = 0; i < netmap_real_rings(na, NR_TX); i++) { + struct pst_fdt *ft = nm_os_malloc(sizeof(struct pst_fdt)); + if (!ft) { + pst_fdt_free(na); + return ENOMEM; + } + NMR(na, NR_TX)[i]->nkr_ft = (struct nm_bdg_fwd *)ft; + } + return 0; +} + +void +pst_fdt_add(struct nmcb *cb, struct netmap_kring *kring) +{ + struct netmap_slot *slot = nmcb_slot(cb); + struct pst_fdt *ft = pst_get_fdt(kring); + struct pst_fdt_q *fde; + + fde = ft->fde + nm_pst_getfd(slot); + + cb->next = NM_FDT_NULL; + if (fde->fq_head == NM_FDT_NULL) { + fde->fq_head = fde->fq_tail = slot->buf_idx; + ft->fds[ft->nfds++] = nm_pst_getfd(slot); + } else { + struct netmap_slot tmp = { fde->fq_tail }; + struct nmcb *prev = NMCB_SLT(kring->na, &tmp); + prev->next = fde->fq_tail = slot->buf_idx; + } + ft->npkts++; +} + +/* XXX should go away */ +static void +pst_fdt_may_reset(struct netmap_kring *kring) +{ + struct pst_fdt *ft = pst_get_fdt(kring); + + if (likely(ft->nfds == 0 && ft->npkts == 0)) + return; + PST_DBG("kring %d fds %d pkts %d", kring->ring_id, ft->nfds, ft->npkts); + if (ft->nfds > 0 && ft->npkts == 0) { + ft->nfds = 0; + } else if (ft->nfds == 0 && ft->npkts > 0) { + int i; + for (i = 0; i < NM_PST_FD_MAX; i++) { + struct pst_fdt_q *fde = ft->fde + i; + if (unlikely(fde->fq_head != NM_FDT_NULL)) { + fde->fq_head = fde->fq_tail = NM_FDT_NULL; + if (--ft->npkts == 0) + break; + } + } + ft->npkts = 0; + } +} + +struct pst_so_adapter * +pst_soa_from_fd(struct netmap_adapter *na, int fd) +{ + struct netmap_pst_adapter *pna = topna(na); + + if (unlikely(fd >= pna->so_adapters_max)) + return NULL; + return pna->so_adapters[fd]; +} + +/* Differ from nm_kr_space() due to different meaning of the lease */ +static inline uint32_t +pst_kr_rxspace(struct netmap_kring *k) +{ + int busy = k->nr_hwtail - k->nkr_hwlease; + + if (busy < 0) + busy += k->nkr_num_slots; + return k->nkr_num_slots - 1 - busy; +} + +static void +pst_poststack(struct netmap_kring *kring) +{ + struct netmap_adapter *na = kring->na, *rxna; + struct nm_bridge *b = tovpna(na)->na_bdg; + struct pst_fdt *ft = pst_get_fdt(kring); + uint32_t *nonfree = ft->tmp; + u_int lim_rx, howmany, nrings; + struct netmap_kring *rxr; + int j, want, sent = 0, nonfree_num = 0; + + if (!BDG_RTRYLOCK(b)) { + return; + } + + if (is_host(na)) { + want = kring->rhead - kring->nr_hwcur; + if (want < 0) + want += kring->nkr_num_slots; + } else { + want = ft->npkts; + } + + /* Now, we know how many packets go to the 
receiver */ + if (na == pst_na(na) || is_host(na)) { + if (unlikely(b->bdg_ports[1] == NULL)) + goto runlock; + rxna = &b->bdg_ports[1]->up; /* XXX */ + } else { + rxna = pst_na(na); + } + + /* XXX Ugly but we cannot use ring_id on host rings */ + nrings = nma_get_nrings(rxna, NR_RX); + rxr = NMR(rxna, NR_RX)[(kring - NMR(na, NR_TX)[0]) % nrings]; + lim_rx = rxr->nkr_num_slots - 1; + j = rxr->nr_hwtail; + + /* under lock */ + mtx_lock(&rxr->q_lock); + /* no need to check rxr->nkr_stopped */ + + howmany = pst_kr_rxspace(rxr); + if (howmany < want) { // try to reclaim completed buffers + u_int n = 0; + rxr->nkr_hwlease = + rollup(rxr, rxr->nkr_hwlease, rxr->nr_hwtail, &n); + howmany += n; + } else if (likely(want < howmany)) { + howmany = want; + } + if (unlikely(howmany == 0)) { + goto unlock_kring; + } + + if (is_host(na)) { // don't touch buffers, slightly faster + u_int k = kring->nr_hwcur, lim_tx = kring->nkr_num_slots - 1; + while (howmany--) { + struct netmap_slot *rs, *ts = &kring->ring->slot[k]; + rs = &rxr->ring->slot[j]; + pst_swap(ts, rs); + k = nm_next(k, lim_tx); + j = nm_next(j, lim_rx); + sent++; + } + } else { + int n = 0; + while (n < ft->nfds && likely(howmany)) { + int fd = ft->fds[n]; + struct pst_fdt_q *fq = ft->fde + fd; + uint32_t next = fq->fq_head; + while (next != NM_FDT_NULL && likely(howmany)) { + struct netmap_slot tmp = { next }; + struct netmap_slot *ts, *rs; + struct nmcb *cb; + + rs = &rxr->ring->slot[j]; + cb = NMCB_SLT(na, &tmp); + if (likely(nmcb_valid(cb))) { + next = cb->next; + cb->next = NM_FDT_NULL; + } else { + next = NM_FDT_NULL; + goto skip; + } + ts = nmcb_slot(cb); + if (unlikely(ts == NULL)) { + PST_DBG("null ts nxt %u fd %d bufi " + "%u valid %d cb %p ft %d", + next, fd, tmp.buf_idx, + nmcb_valid(cb), cb, + nmcb_rstate(cb) == MB_FTREF); + goto skip; + } else if (unlikely(cb != NMB(na, ts))) { + PST_DBG("fd %d cb %p != nmb %p " + "len %d state %d", fd, cb, + NMB(na, nmcb_slot(cb)), ts->len, + nmcb_rstate(cb) == MB_FTREF); + } + + if (nmcb_rstate(cb) == MB_TXREF) { + nonfree[nonfree_num++] = j; + } + pst_swap(ts, rs); + if (nmcb_rstate(cb) == MB_FTREF) { + nmcb_wstate(cb, MB_NOREF); + pst_extra_deq(nmcb_kring(cb), ts); + } + nmcbw(cb, nmcb_kring(cb), rs);// needed? +skip: + j = nm_next(j, lim_rx); + sent++; + howmany--; + } + if (next == NM_FDT_NULL) { + n++; + fq->fq_tail = NM_FDT_NULL; + } + fq->fq_head = next; // no NULL if howmany has run out + } + ft->nfds -= n; + ft->npkts -= sent; + memmove(ft->fds, ft->fds + n, sizeof(ft->fds[0]) * ft->nfds); + pst_fdt_may_reset(kring); + } + + rxr->nr_hwtail = j; // no update if !sent +unlock_kring: + mtx_unlock(&rxr->q_lock); + + if (likely(sent)) + rxr->nm_notify(rxr, 0); + rxr->nkr_hwlease = rxr->nr_hwcur; + + /* swap out packets still referred by the stack */ + for (j = 0; j < nonfree_num; j++) { + struct netmap_slot *slot = &rxr->ring->slot[nonfree[j]]; + struct nmcb *cb = NMCB_SLT(rxna, slot); + if (unlikely(pst_extra_enq(nmcb_kring(cb), slot))) { + /* Don't reclaim on/after this positon */ + rxr->nkr_hwlease = slot - rxr->ring->slot; + if (netmap_debug) { + if (rxr->nkr_hwlease > rxr->nkr_num_slots) + panic(" "); + } + break; + } + } +runlock: + BDG_RUNLOCK(b); + return; +} + +/* Form fdtable to be flushed */ +static int +pst_prestack(struct netmap_kring *kring) +{ + struct netmap_adapter *na = kring->na; + int k = kring->nr_hwcur; + const u_int lim_tx = kring->nkr_num_slots - 1; + const int rhead = kring->rhead; + const bool tx = na == pst_na(na) ? 
1 : 0; + struct pst_fdt *ft = pst_get_fdt(kring); + + if (!tx && is_host(na)) + kring->nkr_hwlease = rhead; // skip loop below + for (k = kring->nkr_hwlease; k != rhead; k = nm_next(k, lim_tx)) { + struct netmap_slot *slot = &kring->ring->slot[k]; + int err; + if (unlikely(slot->len == 0)) { + continue; + } + nmcbw(NMCB_SLT(na, slot), kring, slot); + if (tx) { + const u_int offset = nm_get_offset(kring, slot); + const u_int pst_offset = nm_pst_getdoff(slot); + /* XXX how to inform -EINVAL */ + if (unlikely(slot->len < pst_offset)) { + if (netmap_verbose) + nm_prinf("data offset %u too large", + offset); + break; + } else if (unlikely(offset < sizeof(struct nmcb))) { + if (netmap_verbose) + nm_prinf("offset must be at least %lu", + sizeof(struct nmcb)); + break; + } + err = nm_os_pst_tx(kring, slot); + } else { + err = nm_os_pst_rx(kring, slot); + } + if (unlikely(err)) { + /* + * EBUSY advances the cursor as the stack has consumed + * data (see nm_os_pst_tx()). EINVAL stops that as the + * client is likely misbehaving. + */ + if (err == -EBUSY) { + k = nm_next(k, lim_tx); + } + break; + } + } + kring->nkr_hwlease = k; // next position to process in the stack + pst_poststack(kring); + if (ft->npkts) { // we have leftover, cannot report k + k = rollup(kring, kring->nr_hwcur, k, NULL); + } + return k; +} + +static int +nombq(struct netmap_adapter *na, struct mbuf *m) +{ + struct netmap_kring *kring; + struct netmap_slot *hs; + u_int head, nm_i, lim, len = MBUF_LEN(m); + + /* host ring */ + nm_i = curcpu % nma_get_host_nrings(na, NR_RX); + kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + nm_i]; + head = kring->rhead; + lim = kring->nkr_num_slots - 1; + nm_i = kring->nr_hwtail; + /* check space */ + if (unlikely(nm_i == nm_prev(kring->nr_hwcur, lim))) { + netmap_bwrap_intr_notify(kring, 0); + if (kring->nr_hwtail == nm_prev(kring->nr_hwcur, lim)) { + m_freem(m); + return EBUSY; + } + } else if (unlikely(!nm_netmap_on(na))) { + m_freem(m); + return ENXIO; + } + hs = &kring->ring->slot[nm_i]; + m_copydata(m, 0, len, (char *)NMB(na, hs) + nm_get_offset(kring, hs)); + hs->len = len; + kring->nr_hwtail = nm_next(nm_i, lim); + + nm_i = kring->nr_hwcur; + if (likely(nm_i != head)) { + kring->nr_hwcur = head; + } + if (!paste_host_batch) { + netmap_bwrap_intr_notify(kring, 0); + } + /* as if netmap_transmit + rxsync_from_host done */ + m_freem(m); + return 0; +} + +#ifdef __FreeBSD__ +/* FreeBSD doesn't have protocol header offsets filled */ +static inline void +mbuf_proto_headers(struct mbuf *m) +{ + uint16_t ethertype; + + ethertype = ntohs(*(uint16_t *)(m->m_data + 12)); + if (MBUF_L3_OFST(m) > 0) + return; + m->m_pkthdr.l2hlen = sizeof(struct ether_header); + m->m_pkthdr.l3hlen = sizeof(struct nm_iphdr); +} +#else +#define mbuf_proto_headers(m) +#endif /* !FreeBSD */ + +static void +csum_transmit(struct netmap_adapter *na, struct mbuf *m) +{ + if (nm_os_mbuf_has_csum_offld(m)) { + struct nm_iphdr *iph; + char *th; + uint16_t *check; + mbuf_proto_headers(m); + iph = (struct nm_iphdr *)MBUF_L3_HEADER(m); + th = MBUF_L4_HEADER(m); + if (iph->protocol == IPPROTO_UDP) + check = &((struct nm_udphdr *)th)->check; + else if (likely(iph->protocol == IPPROTO_TCP)) + check = &((struct nm_tcphdr *)th)->check; + else + panic("bad proto %u w/ offld", iph->protocol); + /* With ethtool -K eth1 tx-checksum-ip-generic on, we + * see HWCSUM/IP6CSUM in dev and ip_sum PARTIAL on m. 
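+		 * The checksum has therefore not been computed yet, so
+		 * compute it in software over the L4 header and payload
+		 * here, before nombq() passes the frame to the host RX ring.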
+ */ + *check = 0; + nm_os_csum_tcpudp_ipv4(iph, th, + MBUF_LEN(m) - MBUF_L4_OFST(m), check); + //m->ip_summed = 0; + //m->m_pkthdr.csum_flags = CSUM_TSO; // XXX + } + nombq(na, m); +} + +int +netmap_pst_transmit(struct ifnet *ifp, struct mbuf *m) +{ + struct netmap_adapter *na = NA(ifp); + struct nmcb *cb = NULL; + struct netmap_kring *kring; + struct netmap_slot *slot; + char *nmb; + int hole; + const u_int bufsize = NETMAP_BUF_SIZE(na); + u_int poff, doff, mlen; + +#ifdef __FreeBSD__ + /* M_EXT or multiple mbufs (i.e., chain) */ + if ((m->m_flags & M_EXT)) // not TCP case + cb = NMCB_EXT(m, 0, bufsize); + if (!(cb && nmcb_valid(cb))) { // TCP case + if (MBUF_HASNEXT(m) && (m->m_next->m_flags & M_EXT)) { + (void)bufsize; + cb = NMCB_EXT(m->m_next, 0, bufsize); + } + } +#elif defined(linux) + /* txsync-ing TX packets are always frags */ + if (!MBUF_HASNEXT(m)) { + csum_transmit(na, m); + return 0; + } + + cb = NMCB_EXT(m, 0, bufsize); +#endif /* __FreeBSD__ */ + if (unlikely(!(cb && nmcb_valid(cb)))) { + csum_transmit(na, m); + return 0; + } + kring = nmcb_kring(cb); + /* + * Linux pushes down the same cb multiple times for a slot whose + * len > MTU, but the first one clears the MB_STACK state. + */ + if (unlikely(nmcb_rstate(cb) != MB_STACK) +#ifdef __FreeBSD__ + /* FreeBSD ARP reply recycles the RX mbuf */ + || unlikely(kring && kring->na->na_private == na->na_private) +#endif /* __FreeBSD__ */ + ) { + if (netmap_verbose) + nm_prlim(1, "cb %p state %d len %u", cb, + nmcb_rstate(cb), MBUF_LEN(m)); +#ifdef linux + if (unlikely(nmcb_rstate(cb) == MB_QUEUED)) { + nmcb_wstate(NMCB_EXT(m, 0, bufsize), MB_NOREF); + pst_extra_deq(kring, nmcb_slot(cb)); + } +#endif + MBUF_FLATTEN(m); // XXX + csum_transmit(na, m); + return 0; + } + + /* Valid cb, txsync-ing packet. 
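+	 * The payload already sits in the netmap buffer referenced by the
+	 * slot, so the code below copies or shifts only the protocol headers
+	 * and computes the checksum in place; the payload itself stays
+	 * zero-copy.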
*/ + + slot = nmcb_slot(cb); + nmb = NMB(na, slot); + if (unlikely((struct nmcb *)nmb != cb)) { + panic("nmb %p cb %p", nmb, cb); + } + mlen = m_length(m, NULL); + poff = nm_get_offset(kring, slot); + doff = nm_pst_getdoff(slot); + hole = (int)doff - MBUF_HDRLEN(m); + if (!hole) { + /* bring headers in */ + memcpy(nmb + poff, MBUF_DATA(m), doff); + if (netmap_verbose) + nm_prlim(1,"zerocopy: off %u len %u", + poff + doff, slot->len); + } else { + caddr_t p = nmb + poff + doff; + if (netmap_verbose) + nm_prlim(1, "hole %d poff %u doff %u hlen %u mlen %u " + "mclen %u slen %u", + hole, poff, doff, MBUF_HDRLEN(m), MBUF_LEN(m), + mlen, slot->len); + // buffers can overlap + memcpy(p - hole, p, MBUF_LEN(m) - MBUF_HDRLEN(m)); + nm_os_pst_mbuf_extadj(m, 0, -hole); + memcpy(nmb + poff, MBUF_DATA(m), MBUF_HDRLEN(m)); + slot->len -= hole; + nm_pst_setdoff(slot, doff - hole); // inform user + } + if (unlikely(slot->len > mlen)) { + slot->len = mlen; + } + + if (nm_os_mbuf_has_csum_offld(m)) { + struct nm_iphdr *iph; + struct nm_tcphdr *tcph; + uint16_t *check; + int len; + mbuf_proto_headers(m); + iph = (struct nm_iphdr *)(nmb + poff + MBUF_L3_OFST(m)); + tcph = (struct nm_tcphdr *)(nmb + poff + MBUF_L4_OFST(m)); + MBUF_CSUM_DONE(m); + check = &tcph->check; + *check = 0; + len = slot->len - MBUF_L4_OFST(m); + nm_os_csum_tcpudp_ipv4(iph, tcph, len, check); + } + pst_fdt_add(cb, kring); + + /* the stack might hold reference via clone, so let's see */ + nmcb_wstate(cb, MB_TXREF); +#ifdef linux + /* in FreeBSD mbuf comes from our code */ + pst_get_extra_ref(kring); + nm_os_set_mbuf_data_destructor(m, &cb->ui, nm_os_pst_mbuf_data_dtor); +#endif /* linux */ + m_freem(m); + return 0; +} + +static void +pst_extra_free_kring(struct netmap_kring *kring) +{ + struct pst_extra_pool *extra; + + /* kring->nr_mode is NKR_NEMAP_OFF on do_unregif() after reg failure + * (e.g., for allocating some netmap object) + */ + if (!kring->extra) { + PST_DBG("%s kr %u no extra", kring->na->name, kring->ring_id); + return; + } + extra = kring->extra; + if (extra->busy != NM_EXT_NULL) { + PST_DBG("%s kr %u busy %u", + kring->na->name, kring->ring_id, extra->busy); + } + kring->extra = NULL; + extra->num = 0; + if (extra->slots) + nm_os_free(extra->slots); + nm_os_free(extra); +} + +static void +pst_extra_free(struct netmap_adapter *na) +{ + int i; + + for (i = 0; i < nma_get_nrings(na, NR_TX); i++) { + pst_extra_free_kring(NMR(na, NR_TX)[i]); + } +} + +static int +pst_extra_alloc_kring(struct netmap_kring *kring) +{ + struct netmap_adapter *na = kring->na; + struct pst_extra_pool *pool; + struct pst_extra_slot *extra_slots = NULL; + u_int want = paste_extra, n, j, next; + + pool = nm_os_malloc(sizeof(*pool)); + if (!pool) + return ENOMEM; + kring->extra = pool; // make extra_free_kring()-able + + n = netmap_extra_alloc(na, &next, want); + if (n == 0) { + return ENOMEM; + } else if (n < want) { + if (netmap_verbose) + nm_prinf("allocated only %u bufs", n); + } + kring->extra->num = n; + if (n) { + extra_slots = nm_os_malloc(sizeof(*extra_slots) * n); + if (!extra_slots) + return ENOMEM; + } + + for (j = 0; j < n; j++) { + struct pst_extra_slot *exs; + struct netmap_slot tmp = {.buf_idx = next}; + + exs = &extra_slots[j]; + exs->slot.buf_idx = next; + exs->slot.len = 0; + exs->slot.ptr = (exs->slot.ptr & ~kring->offset_mask) | + (sizeof(struct nmcb) & kring->offset_mask); + exs->prev = j == 0 ? NM_EXT_NULL : j - 1; + exs->next = j + 1 == n ? 
NM_EXT_NULL : j + 1; + next = *(uint32_t *)NMB(na, &tmp); + } + pool->free = 0; + pool->free_tail = n - 1; + pool->busy = pool->busy_tail = NM_EXT_NULL; + pool->slots = extra_slots; + return 0; +} + +static int +pst_extra_alloc(struct netmap_adapter *na) +{ + int i; + + /* we don't need extra on host rings */ + for (i = 0; i < nma_get_nrings(na, NR_TX); i++) { + if (pst_extra_alloc_kring(NMR(na, NR_TX)[i])) { + pst_extra_free(na); + return ENOMEM; + } + } + return 0; +} + +static void +pst_mbufpool_free(struct netmap_adapter *na) +{ + int i; + + for (i = 0; i < nma_get_nrings(na, NR_TX); i++) { + struct netmap_kring *kring = NMR(na, NR_TX)[i]; + if (kring->tx_pool == NULL) + continue; + if (kring->tx_pool[1]) { + m_freem((struct mbuf *)kring->tx_pool[1]); + kring->tx_pool[1] = NULL; + } + if (kring->tx_pool[0]) { + nm_os_free(kring->tx_pool[0]); + kring->tx_pool[0] = NULL; + } + nm_os_free(kring->tx_pool); + kring->tx_pool = NULL; + } +} + +/* Create extra buffers and mbuf pool */ +static int +pst_mbufpool_alloc(struct netmap_adapter *na) +{ + struct netmap_kring *kring; + int i, error = 0; + + for (i = 0; i < nma_get_nrings(na, NR_TX); i++) { + kring = NMR(na, NR_TX)[i]; + kring->tx_pool = nm_os_malloc(2 * sizeof(struct mbuf *)); + if (!kring->tx_pool) { + PST_DBG("tx_pool allocation failed"); + error = ENOMEM; + break; + } + kring->tx_pool[0] = nm_os_malloc(sizeof(struct mbuf)); + if (!kring->tx_pool[0]) { + error = ENOMEM; + break; + } + } + if (error) { + pst_mbufpool_free(na); + } + return error; +} + +static int +pst_kwait(void *data) +{ + struct netmap_pst_adapter *pna = (struct netmap_pst_adapter *)data; + struct netmap_priv_d *kpriv = pna->kpriv; + int lim = 20; + bool s = false; + + for (; lim > 0; s = 0) { + if (pna->num_so_adapters > 0) { + if (netmap_verbose) + nm_prinf("waiting for %d sockets to go", + pna->num_so_adapters); + s = true; + } + if (!pst_extra_noref(&pna->up.up)) { + if (netmap_verbose) + nm_prinf("waiting for mbufs to go"); + if (s == false) + lim--; + s = true; + } + if (!s) + break; + pause("netmap-pst-kwait-pause", 2000); + } + if (netmap_verbose) + nm_prinf("%s deleting priv", pna->up.up.name); + NMG_LOCK(); + pna->kpriv = NULL; + /* we don't clear pna->kwaittdp to indicate my run */ + netmap_priv_delete(kpriv); + NMG_UNLOCK(); +#ifdef __FreeBSD__ + kthread_exit(); +#endif /* __FreeBSD__ */ + return 0; +} + +static void +pst_write_offset(struct netmap_adapter *na, bool noring) +{ + enum txrx t; + u_int i, j; + const u_int offset = sizeof(struct nmcb); + const u_int mask = 0xff; + + for_rx_tx(t) { + for (i = 0; i < netmap_real_rings(na, t); i++) { + struct netmap_kring *kring = NMR(na, t)[i]; + struct netmap_ring *ring = kring->ring; + + kring->offset_max = mask; + kring->offset_mask = mask; + /* Since app threads individually register port/rings, + * there exist rings not enabled yet. + * + * We cannot use nm_kring_pending_on(). Unlike main + * rings, host rings have both mode and pending_mode + * turned on already by netmap_update_hostrings_mode(). + */ + if (noring) { + continue; + } else if (kring->nr_pending_mode != NKR_NETMAP_ON) { + PST_DBG("%s %s %d not ready", na->name, + t == NR_TX ? 
"tx" : "rx", i); + continue; + } + *(uint64_t *)(uintptr_t)&ring->offset_mask = mask; + for (j = 0; j < kring->nkr_num_slots; j++) { + nm_write_offset(kring, ring->slot + j, offset); + } + } + } +} + +static int +netmap_pst_bwrap_reg(struct netmap_adapter *na, int onoff) +{ + struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; +#ifdef linux + struct netmap_hw_adapter *hw = (struct netmap_hw_adapter *)hwna; +#endif + + if (onoff) { + int i, error; + + if (na->nm_mem != pst_na(na)->nm_mem) { + PST_DBG("mem mismatch %s nm_mem %p %s nm_mem %p", + na->name, na->nm_mem, + pst_na(na)->name, pst_na(na)->nm_mem); + return EINVAL; + } + if (bna->up.na_bdg->bdg_active_ports > 3) { + PST_DBG("%s: only one NIC is supported", na->name); + return ENOTSUP; + } + /* netmap_do_regif just created rings. As we cannot rely on + * netmap_offsets_init, we set offsets here. + */ + pst_write_offset(na, 0); + pst_write_offset(hwna, 1); + + error = netmap_bwrap_reg(na, onoff); + if (error) + return error; + if (pst_extra_alloc(na)) { + PST_DBG("extra_alloc failed for bwrap"); + netmap_bwrap_reg(na, 0); + return ENOMEM; + } + if (pst_mbufpool_alloc(na)) { + PST_DBG("mbufpool_alloc failed for bwrap"); + pst_extra_free(na); + netmap_bwrap_reg(na, 0); + return ENOMEM; + } + + /* na->if_transmit already has backup */ +#ifdef linux + hw->nm_ndo.ndo_start_xmit = linux_pst_start_xmit; + /* re-overwrite */ + hwna->ifp->netdev_ops = &hw->nm_ndo; +#elif defined (__FreeBSD__) + hwna->ifp->if_transmit = netmap_pst_transmit; +#endif /* linux */ + + /* set void callback on host rings */ + for (i = nma_get_nrings(hwna, NR_RX); + i < netmap_real_rings(hwna, NR_RX); i++) { + NMR(hwna, NR_RX)[i]->nm_sync = netmap_vp_rxsync_locked; + } + } else { +#ifdef linux + /* restore default start_xmit for future register */ + ((struct netmap_hw_adapter *)hwna)->nm_ndo.ndo_start_xmit = + linux_netmap_start_xmit; +#else + hwna->ifp->if_transmit = hwna->if_transmit; +#endif + pst_mbufpool_free(na); + pst_extra_free(na); + return netmap_bwrap_reg(na, onoff); + } + return 0; +} + +static int +netmap_pst_bwrap_intr_notify(struct netmap_kring *kring, int flags) { + struct netmap_adapter *hwna = kring->na, *vpna, *mna; + enum txrx t = kring->tx ? NR_TX : NR_RX; + + vpna = (struct netmap_adapter *)hwna->na_private; + if (unlikely(!vpna)) + return NM_IRQ_COMPLETED; + + /* just wakeup the client on the master */ + mna = pst_na(vpna); + if (likely(mna)) { + //u_int me = kring - NMR(hwna, t), last; + u_int me = kring->ring_id, last; + struct netmap_kring *mk; + last = nma_get_nrings(mna, t); + mk = NMR(mna, t)[last > me ? 
+static int
+netmap_pst_bwrap_intr_notify(struct netmap_kring *kring, int flags)
+{
+	struct netmap_adapter *hwna = kring->na, *vpna, *mna;
+	enum txrx t = kring->tx ? NR_TX : NR_RX;
+
+	vpna = (struct netmap_adapter *)hwna->na_private;
+	if (unlikely(!vpna))
+		return NM_IRQ_COMPLETED;
+
+	/* just wake up the client on the master */
+	mna = pst_na(vpna);
+	if (likely(mna)) {
+		u_int me = kring->ring_id, last;
+		struct netmap_kring *mk;
+
+		last = nma_get_nrings(mna, t);
+		mk = NMR(mna, t)[last > me ? me : me % last];
+		mk->nm_notify(mk, 0);
+	}
+	return NM_IRQ_COMPLETED;
+}
+
+static void
+pst_unregister_socket(struct pst_so_adapter *soa)
+{
+	NM_SOCK_T *so = soa->so;
+	struct netmap_pst_adapter *pna = topna(soa->na);
+
+	mtx_lock(&pna->so_adapters_lock);
+	if (soa->fd >= pna->so_adapters_max)
+		panic("unregistered or invalid fd %d", soa->fd);
+	pna->so_adapters[soa->fd] = NULL;
+	pna->num_so_adapters--;
+	SOCKBUF_LOCK(&so->so_rcv);
+	RESTORE_SOUPCALL(so, soa);
+	RESTORE_SODTOR(so, soa);
+	SOCKBUF_UNLOCK(&so->so_rcv);
+	pst_wso(NULL, so);
+	wmb();
+	bzero(soa, sizeof(*soa));
+	nm_os_free(soa);
+	mtx_unlock(&pna->so_adapters_lock);
+}
+
+static void
+pst_sodtor(NM_SOCK_T *so)
+{
+	if (pst_so(so))
+		pst_unregister_socket(pst_so(so));
+	if (so->so_dtor) {
+		so->so_dtor(so);
+	}
+}
+
+void
+pst_mbuf_data_dtor(struct nmcb *cb)
+{
+	if (unlikely(!nmcb_valid(cb))) {
+		PST_DBG("invalid cb %p", cb);
+		return;
+	} else if (unlikely(nmcb_kring(cb) == NULL)) {
+		PST_DBG("no kring in cb %p", cb);
+		return;
+	}
+	pst_put_extra_ref(nmcb_kring(cb));
+	if (nmcb_rstate(cb) != MB_FTREF) {
+		/* a consumed RX mbuf is not in the extra pool,
+		 * so it just bounces here */
+		pst_extra_deq(nmcb_kring(cb), nmcb_slot(cb));
+		nmcb_wstate(cb, MB_NOREF);
+	}
+}
+
+/* Under NMG_LOCK() */
+static void
+netmap_pst_bdg_dtor(const struct netmap_vp_adapter *vpna)
+{
+	struct netmap_pst_adapter *pna;
+
+	if (&vpna->up != pst_na(&vpna->up))
+		return;
+
+	pna = (struct netmap_pst_adapter *)(void *)(uintptr_t)vpna;
+	mtx_lock(&pna->so_adapters_lock);
+	bzero(pna->so_adapters, sizeof(uintptr_t) * pna->so_adapters_max);
+	pna->so_adapters_max = 0;
+	nm_os_free(pna->so_adapters);
+	pna->so_adapters = NULL;
+	mtx_unlock(&pna->so_adapters_lock);
+	mtx_destroy(&pna->so_adapters_lock);
+	wmb();
+	if (netmap_verbose)
+		nm_prinf("destroyed everything");
+}
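/*
 * [Editorial sketch, not part of the patch] pst_sodtor() above relies
 * on the save/restore pattern for socket callbacks: the original
 * destructor is stashed in the pst_so_adapter when an fd is registered
 * and re-installed (then invoked) on teardown. Reduced to its essence,
 * with hypothetical types:
 */
struct cb_chain_sketch {
	void (*saved_dtor)(void *);	/* the callback we displaced */
};

static void
chained_dtor_sketch(void *obj, struct cb_chain_sketch *c)
{
	/* ... release our own per-object state first ... */
	if (c->saved_dtor)
		c->saved_dtor(obj);	/* then defer to the original */
}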
+/* not NMG_LOCK held */
+int
+netmap_pst_register_fd(struct netmap_adapter *na, int fd)
+{
+	NM_SOCK_T *so;
+	void *file;
+	struct pst_so_adapter *soa;
+	struct netmap_pst_adapter *pna = topna(na);
+	int error = 0;
+
+	if (unlikely(fd > NM_PST_FD_MAX)) {
+		PST_DBG("fd %d too high", fd);
+		return ENOMEM;
+	}
+	if (unlikely(fd < 3)) {
+		PST_DBG("bad fd %d", fd);
+		return EINVAL;
+	}
+	so = nm_os_sock_fget(fd, &file);
+	if (!so)
+		return EINVAL;
+	if (!nm_os_so_connected(so)) {
+		nm_os_sock_fput(so, file);
+		return EINVAL;
+	}
+
+	so_lock(so);
+	/* [Linux] a socket that looked connected may already be dead */
+	if (nm_os_sock_dead(so)) {
+		PST_DBG("so %p SOCK_DEAD", so);
+		so_unlock(so);
+		nm_os_sock_fput(so, file);
+		return EINVAL;
+	}
+	if (pst_so(so)) {
+		PST_DBG("fd %d already registered", fd);
+		so_unlock(so);
+		nm_os_sock_fput(so, file);
+		return EBUSY;
+	}
+
+	/* serialize simultaneous accept/config */
+	mtx_lock(&pna->so_adapters_lock);
+	/* first check the table size */
+	if (fd >= pna->so_adapters_max) {
+		struct pst_so_adapter **old = pna->so_adapters, **new;
+		const int oldsize = pna->so_adapters_max;
+		const int newsize = oldsize ? oldsize * 2 : DEFAULT_SK_ADAPTERS;
+
+		new = nm_os_malloc(sizeof(*new) * newsize);
+		if (!new) {
+			PST_DBG("failed to extend the fd table");
+			error = ENOMEM;
+			goto unlock_return;
+		}
+		if (old) {
+			memcpy(new, old, sizeof(*old) * oldsize);
+			bzero(old, sizeof(*old) * oldsize);
+			nm_os_free(old);
+		}
+		pna->so_adapters = new;
+		pna->so_adapters_max = newsize;
+	}
+
+	soa = nm_os_malloc(sizeof(*soa));
+	if (!soa) {
+		error = ENOMEM;
+		goto unlock_return;
+	}
+	SOCKBUF_LOCK(&so->so_rcv);
+	SAVE_SOUPCALL(so, soa);
+	SAVE_SODTOR(so, soa);
+	soa->na = na;
+	soa->so = so;
+	soa->fd = fd;
+	pst_wso(soa, so);
+	SET_SOUPCALL(so, nm_os_pst_upcall);
+	SET_SODTOR(so, pst_sodtor);
+	pna->so_adapters[fd] = soa;
+	pna->num_so_adapters++;
+	nm_os_sock_set_nocoalesce(&so->so_rcv);
+	wmb();
+	SOCKBUF_UNLOCK(&so->so_rcv);
+
+	SOCKBUF_LOCK(&so->so_snd);
+	nm_os_sock_set_nocoalesce(&so->so_snd);
+	SOCKBUF_UNLOCK(&so->so_snd);
+
+#ifdef linux
+	nm_os_set_nodelay(so); // within the same socket lock
+#endif
+unlock_return:
+	if (!error) {
+		error = nm_os_pst_sbdrain(na, so);
+	}
+	mtx_unlock(&pna->so_adapters_lock);
+	so_unlock(so);
+#ifdef __FreeBSD__
+	if (!error) {
+		nm_os_set_nodelay(so); // inp must be locked outside our lock
+	}
+#endif
+	nm_os_sock_fput(so, file);
+	return error;
+}
+
+static int
+netmap_pst_reg(struct netmap_adapter *na, int onoff)
+{
+	int err;
+
+	if (onoff) {
+		pst_write_offset(na, false);
+		if (na->active_fds > 0) {
+			goto vp_reg;
+		}
+		err = pst_extra_alloc(na);
+		if (err)
+			return err;
+	} else {
+		struct nm_bridge *b = tovpna(na)->na_bdg;
+		int i;
+		struct netmap_pst_adapter *pna;
+
+		if (na->active_fds > 0)
+			goto vp_reg;
+
+		pna = (struct netmap_pst_adapter *)na;
+		if (netmap_verbose)
+			nm_prinf("%s active_fds %d num_so_adapters %d",
+				na->name, na->active_fds, pna->num_so_adapters);
+		if (!pna->kwaittdp &&
+		    (pna->num_so_adapters > 0 || !pst_extra_noref(na))) {
+			struct netmap_if *nifp;
+			enum txrx t;
+			struct netmap_priv_d *kpriv = netmap_priv_new();
+
+			kpriv->np_na = na;
+			/* revert krings_put() - this also survives
+			 * mem_rings_delete */
+			for_rx_tx(t) {
+				kpriv->np_qfirst[t] = 0;
+				kpriv->np_qlast[t] = nma_get_nrings(na, t);
+				for (i = 0; i < nma_get_nrings(na, t); i++) {
+					NMR(na, t)[i]->users++;
+					NMR(na, t)[i]->nr_pending_mode =
+						NKR_NETMAP_ON;
+				}
+			}
+			err = netmap_mem_finalize(na->nm_mem, na);
+			if (err)
+				goto del_kpriv;
+			/* we don't need rings_create() */
+			nifp = netmap_mem_if_new(na, kpriv);
+			if (nifp == NULL) {
+				err = ENOMEM;
+del_kpriv:
+				netmap_priv_delete(kpriv);
+				pst_extra_free(na);
+				return err;
+			}
+			kpriv->np_nifp = nifp;
+			na->active_fds++;
+
+			pna->kpriv = kpriv;
+			netmap_adapter_get(na);
+			/* the adapter cannot die yet; spawn a kthread that
+			 * waits for the references to drain, then return */
+
+			if (netmap_verbose)
+				nm_prinf("spawning kwait");
+			nm_os_kthread_add(pst_kwait, (void *)pna, NULL,
+			    &pna->kwaittdp, 0, 0, "netmap-pst-kwait");
+			return EBUSY; // XXX the caller doesn't care
+		}
+
+		for_bdg_ports(i, b) {
+			struct netmap_vp_adapter *s;
+			struct netmap_adapter *bna;
+			struct nmreq_header hdr;
+			struct nmreq_port_hdr req;
+
+			if (i == 0)
+				continue;
+			s = b->bdg_ports[i];
+			bzero(&hdr, sizeof(hdr));
+			strncpy(hdr.nr_name, s->up.name, sizeof(hdr.nr_name));
+			hdr.nr_reqtype = NETMAP_REQ_PST_DETACH;
+			hdr.nr_version = NETMAP_API;
+			hdr.nr_body = (uintptr_t)&req;
+			bna = &s->up;
+			netmap_adapter_get(bna);
+			if (bna->nm_bdg_ctl) {
+				err = bna->nm_bdg_ctl(&hdr, bna);
+			}
+			netmap_adapter_put(bna);
+		}
+		pst_extra_free(na);
+	}
+vp_reg:
+	err = netmap_vp_reg(na, onoff);
+	return err;
+}
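/*
 * [Editorial sketch, not part of the patch] What a user program does to
 * reach netmap_pst_register_fd() above: issue a NETMAP_REQ_PST_FD_REG
 * control request on an open netmap control fd, carrying the connected
 * TCP socket in struct nmreq_pst_fd_reg (both added by this patch).
 * Error handling is omitted; the function name is hypothetical.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/netmap.h>

static int
pst_fd_reg_sketch(int netmap_fd, const char *port, int sock_fd)
{
	struct nmreq_header hdr;
	struct nmreq_pst_fd_reg req;

	memset(&hdr, 0, sizeof(hdr));
	memset(&req, 0, sizeof(req));
	hdr.nr_version = NETMAP_API;
	hdr.nr_reqtype = NETMAP_REQ_PST_FD_REG;
	strncpy(hdr.nr_name, port, sizeof(hdr.nr_name) - 1);
	req.fd = sock_fd;		/* the socket to bind to the port */
	hdr.nr_body = (uintptr_t)&req;
	return ioctl(netmap_fd, NIOCCTRL, &hdr);
}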
+static inline int
+pst_bdg_valid(struct netmap_adapter *na)
+{
+	struct nm_bridge *b = tovpna(na)->na_bdg;
+
+	if (unlikely(b == NULL)) {
+		return 0;
+	} else if (unlikely(b->bdg_active_ports < 3)) {
+		if (netmap_verbose)
+			nm_prinf("active ports %d", b->bdg_active_ports);
+		return 0;
+	}
+	return 1;
+}
+
+static int
+netmap_pst_txsync(struct netmap_kring *kring, int flags)
+{
+	struct netmap_adapter *na = kring->na;
+	u_int done;
+
+	if (unlikely(!pst_bdg_valid(na))) {
+		/* leave the ring untouched */
+		return 0;
+	}
+	done = pst_prestack(kring);
+
+	kring->nr_hwcur = done;
+	kring->nr_hwtail = nm_prev(done, kring->nkr_num_slots - 1);
+	return 0;
+}
+
+static int
+netmap_pst_rxsync(struct netmap_kring *kring, int flags)
+{
+	struct netmap_pst_adapter *pna = topna(kring->na);
+	struct nm_bridge *b = pna->up.na_bdg;
+	int i, err;
+	u_int intr;
+
+	err = netmap_vp_rxsync_locked(kring, flags); // reclaim buffers
+	if (err)
+		return err;
+
+	intr = intr_disable(); // emulate software interrupt context
+
+	for_bdg_ports(i, b) {
+		struct netmap_vp_adapter *vpna = b->bdg_ports[i];
+		struct netmap_adapter *na = &vpna->up;
+		struct netmap_adapter *hwna;
+		u_int first, last, j, hostnr;
+
+		if (netmap_bdg_idx(vpna) == netmap_bdg_idx(&pna->up))
+			continue;
+		else if (is_host(na))
+			continue;
+		hwna = ((struct netmap_bwrap_adapter *)vpna)->hwna;
+		hostnr = nma_get_host_nrings(hwna, NR_RX);
+
+		first = kring->na->num_rx_rings > 1 ? kring->ring_id : 0;
+		last = na->num_rx_rings;
+		for (j = first; j < last; j += kring->na->num_rx_rings) {
+			struct netmap_kring *hwk, *bk, *hk;
+
+			hwk = NMR(hwna, NR_RX)[j];
+			bk = NMR(na, NR_TX)[j];
+			hk = NMR(hwna, NR_RX)[last + (j % hostnr)];
+			/* the nm_os_pst_drain() context may have linked
+			 * buffer(s) to the fdtable.
+			 */
+			if (pst_get_fdt(bk)->npkts > 0) {
+				pst_poststack(bk);
+			} else {
+				netmap_bwrap_intr_notify(hwk, 0);
+				if (paste_host_batch) {
+					netmap_bwrap_intr_notify(hk, 0);
+				}
+			}
+		}
+	}
+	intr_restore(intr);
+	return netmap_vp_rxsync_locked(kring, flags);
+}
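/*
 * [Editorial sketch, not part of the patch] netmap_pst_txsync() above
 * consumes slots whose ptr field carries a socket fd and a data offset
 * (see sys/net/netmap_paste.h added below). A transmitting application
 * fills those in before advancing the ring; roughly, assuming the
 * header is installed as <net/netmap_paste.h>:
 */
#include <net/netmap_user.h>
#include <net/netmap_paste.h>

static void
pst_tx_sketch(struct netmap_ring *ring, int sock_fd,
    uint16_t doff, uint32_t paylen)
{
	uint32_t i = ring->cur;
	struct netmap_slot *slot = &ring->slot[i];

	nm_pst_setfd(slot, sock_fd);	/* which connection this is for */
	nm_pst_setdoff(slot, doff);	/* where the payload starts */
	slot->len = doff + paylen;
	ring->head = ring->cur = nm_ring_next(ring, i);
	/* followed by ioctl(fd, NIOCTXSYNC, NULL) or poll() */
}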
+static void
+netmap_pst_dtor(struct netmap_adapter *na)
+{
+	struct nm_bridge *b = tovpna(na)->na_bdg;
+
+	if (b) {
+		netmap_bdg_detach_common(b, tovpna(na)->bdg_port, -1);
+	}
+	if (na->ifp != NULL && !nm_iszombie(na)) {
+		NM_DETACH_NA(na->ifp);
+	}
+}
+
+static void
+netmap_pst_krings_delete(struct netmap_adapter *na)
+{
+	pst_fdt_free(na);
+	netmap_krings_delete(na);
+}
+
+static int
+netmap_pst_krings_create(struct netmap_adapter *na)
+{
+	int error = netmap_krings_create(na, 0);
+
+	if (error)
+		return error;
+	error = pst_fdt_alloc(na);
+	if (error)
+		netmap_krings_delete(na);
+	return error;
+}
+
+static void
+netmap_pst_bwrap_krings_delete(struct netmap_adapter *na)
+{
+	netmap_bwrap_krings_delete_common(na);
+	netmap_pst_krings_delete(na);
+}
+
+static int
+netmap_pst_bwrap_krings_create(struct netmap_adapter *na)
+{
+	int error = netmap_pst_krings_create(na);
+
+	if (error)
+		return error;
+	error = netmap_bwrap_krings_create_common(na);
+	if (error) {
+		netmap_pst_krings_delete(na);
+	}
+	return error;
+}
+
+static int
+netmap_pst_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
+		struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
+{
+	struct nmreq_register *req =
+		(struct nmreq_register *)(uintptr_t)hdr->nr_body;
+	struct netmap_pst_adapter *pna;
+	struct netmap_vp_adapter *vpna;
+	struct netmap_adapter *na;
+	int error = 0;
+	u_int npipes = 0;
+
+	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
+		return EINVAL;
+	}
+
+	pna = nm_os_malloc(sizeof(*pna));
+	if (pna == NULL)
+		return ENOMEM;
+	mtx_init(&pna->so_adapters_lock, "so_adapters_lock", NULL, MTX_DEF);
+	vpna = &pna->up;
+	na = &vpna->up;
+
+	na->ifp = ifp;
+	strncpy(na->name, hdr->nr_name, sizeof(na->name));
+	na->num_tx_rings = req->nr_tx_rings;
+	nm_bound_var(&na->num_tx_rings, 1, 1, NM_PST_MAXRINGS, NULL);
+	req->nr_tx_rings = na->num_tx_rings; /* write back */
+	na->num_rx_rings = req->nr_rx_rings;
+	nm_bound_var(&na->num_rx_rings, 1, 1, NM_PST_MAXRINGS, NULL);
+	req->nr_rx_rings = na->num_rx_rings; /* write back */
+	nm_bound_var(&req->nr_tx_slots, NM_PST_RINGSIZE,
+			1, NM_PST_MAXSLOTS, NULL);
+	na->num_tx_desc = req->nr_tx_slots;
+	nm_bound_var(&req->nr_rx_slots, NM_PST_RINGSIZE,
+			1, NM_PST_MAXSLOTS, NULL);
+	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); /* do we need this? */
+
+	/* XXX should we check extra bufs? */
+	na->num_rx_desc = req->nr_rx_slots;
+	if (ifp)
+		na->na_flags |= NAF_NATIVE;
+	na->nm_txsync = netmap_pst_txsync;
+	na->nm_rxsync = netmap_pst_rxsync;
+	na->nm_register = netmap_pst_reg;
+	na->nm_krings_create = netmap_pst_krings_create;
+	na->nm_krings_delete = netmap_pst_krings_delete;
+	na->nm_dtor = netmap_pst_dtor;
+	na->nm_mem = nmd ?
+		netmap_mem_get(nmd) :
+		netmap_mem_private_new(
+			na->num_tx_rings, na->num_tx_desc,
+			na->num_rx_rings, na->num_rx_desc,
+			req->nr_extra_bufs, npipes, &error);
+	if (na->nm_mem == NULL)
+		goto err;
+	error = netmap_attach_common(na);
+	if (error)
+		goto err;
+	*ret = vpna;
+	return 0;
+
+err:
+	if (na->nm_mem != NULL)
+		netmap_mem_put(na->nm_mem);
+	nm_os_free(pna);
+	return error;
+}
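/*
 * [Editorial note, not part of the patch] The krings_create/_delete
 * pairs above follow netmap's usual unwind discipline: each create
 * routine chains the next lower layer and, on failure, deletes exactly
 * what it created, so netmap_pst_bwrap_krings_create() can fail at any
 * stage without leaking the fdtable or the base krings.
 */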
+static int
+netmap_pst_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
+{
+	struct netmap_bwrap_adapter *bna;
+	struct netmap_adapter *na = NULL;
+	struct netmap_adapter *hostna = NULL;
+	int error;
+
+	if (!nm_os_hwcsum_ok(hwna)) {
+#ifdef __FreeBSD__
+		nm_prinf("paste needs checksum offload disabled");
+#else
+		nm_prinf("paste needs checksum offload enabled");
+#endif /* __FreeBSD__ */
+		return ENOTSUP;
+	}
+
+	bna = nm_os_malloc(sizeof(*bna));
+	if (bna == NULL) {
+		return ENOMEM;
+	}
+	na = &bna->up.up;
+	strncpy(na->name, nr_name, sizeof(na->name));
+	na->nm_register = netmap_pst_bwrap_reg;
+	na->nm_txsync = netmap_pst_txsync;
+	na->nm_krings_create = netmap_pst_bwrap_krings_create;
+	na->nm_krings_delete = netmap_pst_bwrap_krings_delete;
+	na->nm_notify = netmap_bwrap_notify;
+	na->na_flags |= NAF_MOREFRAG; // survive netmap_buf_size_validate()
+	na->na_flags |= NAF_HOST_ALL;
+
+	bna->nm_intr_notify = netmap_pst_bwrap_intr_notify;
+
+	if (hwna->na_flags & NAF_HOST_RINGS) {
+		hostna = &bna->host.up;
+		hostna->nm_notify = netmap_bwrap_notify;
+	}
+
+	error = netmap_bwrap_attach_common(na, hwna);
+	if (error) {
+		nm_os_free(bna);
+	}
+	return error;
+}
+
+struct netmap_bdg_ops pst_bdg_ops = {
+	.lookup = NULL,
+	.dtor = netmap_pst_bdg_dtor,
+	.vp_create = netmap_pst_vp_create,
+	.bwrap_attach = netmap_pst_bwrap_attach,
+	.name = NM_PST_NAME,
+};
+
+int
+netmap_get_pst_na(struct nmreq_header *hdr, struct netmap_adapter **na,
+		struct netmap_mem_d *nmd, int create)
+{
+	return netmap_get_bdg_na(hdr, na, nmd, create, &pst_bdg_ops);
+}
+#endif /* WITH_PASTE */
diff --git a/sys/net/netmap.h b/sys/net/netmap.h
index 6561174e0..7d8e5ae7e 100644
--- a/sys/net/netmap.h
+++ b/sys/net/netmap.h
@@ -555,6 +555,12 @@ enum {
 	NETMAP_REQ_SYNC_KLOOP_STOP,
 	/* Enable CSB mode on a registered netmap control device. */
 	NETMAP_REQ_CSB_ENABLE,
+	/* Attach a netmap port to a PASTE port. */
+	NETMAP_REQ_PST_ATTACH,
+	/* Detach a netmap port from a PASTE port. */
+	NETMAP_REQ_PST_DETACH,
+	/* Register an fd to a PASTE port. */
+	NETMAP_REQ_PST_FD_REG,
 };
 
 enum {
@@ -678,7 +684,8 @@ struct nmreq_port_info_get {
 	uint16_t pad[3];
 };
 
-#define	NM_BDG_NAME		"vale"	/* prefix for bridge port name */
+#define	NM_BDG_NAME		"vale"	/* prefix for vale port name */
+#define	NM_PST_NAME		"pst"	/* prefix for stack port name */
 
 /*
  * nr_reqtype: NETMAP_REQ_VALE_ATTACH
@@ -790,6 +797,11 @@ struct nmreq_sync_kloop_start {
 	uint32_t pad1;
 };
 
+struct nmreq_pst_fd_reg {
+	uint32_t fd;
+	uint32_t pad1;
+};
+
 /* A CSB entry for the application --> kernel direction. */
 struct nm_csb_atok {
 	uint32_t head;	/* AW+ KR+ the head of the appl netmap_ring */
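/*
 * [Editorial sketch, not part of the patch] The request types added to
 * netmap.h above mirror the VALE attach/detach flow: to plug a NIC into
 * a "pst" switch instance, a program names the port "pst:..." and sends
 * NETMAP_REQ_PST_ATTACH, symmetric to the NETMAP_REQ_PST_DETACH request
 * built inside netmap_pst_reg() above. The use of struct nmreq_port_hdr
 * as the request body is an assumption carried over from that detach
 * path; error handling is omitted.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/netmap.h>

static int
pst_attach_sketch(int netmap_fd, const char *pst_port)
{
	struct nmreq_header hdr;
	struct nmreq_port_hdr req;	/* assumed, as in PST_DETACH above */

	memset(&hdr, 0, sizeof(hdr));
	memset(&req, 0, sizeof(req));
	hdr.nr_version = NETMAP_API;
	hdr.nr_reqtype = NETMAP_REQ_PST_ATTACH;
	/* e.g. "pst:0" plus the NIC name, following the bridge naming */
	strncpy(hdr.nr_name, pst_port, sizeof(hdr.nr_name) - 1);
	hdr.nr_body = (uintptr_t)&req;
	return ioctl(netmap_fd, NIOCCTRL, &hdr);
}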
diff --git a/sys/net/netmap_paste.h b/sys/net/netmap_paste.h
new file mode 100644
index 000000000..368601619
--- /dev/null
+++ b/sys/net/netmap_paste.h
@@ -0,0 +1,41 @@
+#ifndef NETMAP_PASTE_H
+#define NETMAP_PASTE_H
+
+static const uint64_t NS_PST_FD_MASK    = 0x000000ffffffff00;
+static const int      NS_PST_FD_SHIFT   = 8;
+static const uint64_t NS_PST_OFST_MASK  = 0x0003ff0000000000;
+static const int      NS_PST_OFST_SHIFT = 40;
+
+static inline int32_t
+nm_pst_getfd(struct netmap_slot *slot)
+{
+	return (int32_t)((slot->ptr & NS_PST_FD_MASK) >> NS_PST_FD_SHIFT);
+}
+
+static inline void
+nm_pst_setfd(struct netmap_slot *slot, int32_t fd)
+{
+	slot->ptr = (slot->ptr & ~NS_PST_FD_MASK) |
+		    (((uint64_t)fd << NS_PST_FD_SHIFT) & NS_PST_FD_MASK);
+}
+
+static inline uint16_t
+nm_pst_getdoff(struct netmap_slot *slot)
+{
+	return (uint16_t)
+		((slot->ptr & NS_PST_OFST_MASK) >> NS_PST_OFST_SHIFT);
+}
+
+static inline void
+nm_pst_setdoff(struct netmap_slot *slot, uint16_t ofst)
+{
+	slot->ptr = (slot->ptr & ~NS_PST_OFST_MASK) |
+		    (((uint64_t)ofst << NS_PST_OFST_SHIFT) & NS_PST_OFST_MASK);
+}
+
+static inline void
+nm_pst_reset_fddoff(struct netmap_slot *slot)
+{
+	slot->ptr = (slot->ptr & ~(NS_PST_FD_MASK | NS_PST_OFST_MASK));
+}
+#endif /* NETMAP_PASTE_H */
diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h
index e17d2dcbe..e8983565d 100644
--- a/sys/net/netmap_user.h
+++ b/sys/net/netmap_user.h
@@ -986,7 +986,7 @@ nm_close(struct nm_desc *d)
 		close(d->fd);
 	}
-	bzero(d, sizeof(*d));
+	bzero((char *)d, sizeof(*d));
 	free(d);
 	return 0;
 }
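/*
 * [Editorial sketch, not part of the patch] Reading the helpers of
 * sys/net/netmap_paste.h back on the RX path: each received slot
 * carries the fd of the socket the payload belongs to and the offset
 * at which that payload starts, so an event loop can demultiplex
 * without copying. Assumes a ring obtained via the usual netmap_user.h
 * calls and the header installed as <net/netmap_paste.h>.
 */
#include <net/netmap_user.h>
#include <net/netmap_paste.h>

static void
pst_rx_sketch(struct netmap_ring *ring)
{
	while (!nm_ring_empty(ring)) {
		uint32_t i = ring->cur;
		struct netmap_slot *slot = &ring->slot[i];
		int fd = nm_pst_getfd(slot);		/* owning socket */
		uint16_t doff = nm_pst_getdoff(slot);	/* payload offset */
		char *payload = (char *)NETMAP_BUF(ring, slot->buf_idx)
				+ (slot->ptr & ring->offset_mask) + doff;
		uint32_t paylen = slot->len - doff;

		/* ... hand (fd, payload, paylen) to the application ... */
		(void)fd; (void)payload; (void)paylen;
		ring->head = ring->cur = nm_ring_next(ring, i);
	}
}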