/* $NetBSD: xennetback_xenbus.c,v 1.112 2023/02/25 00:37:47 riastradh Exp $ */
/*
* Copyright (c) 2006 Manuel Bouyer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xennetback_xenbus.c,v 1.112 2023/02/25 00:37:47 riastradh Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/device.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/bpf.h>
#include <net/if_ether.h>
#include <xen/intr.h>
#include <xen/hypervisor.h>
#include <xen/xen.h>
#include <xen/xen_shm.h>
#include <xen/evtchn.h>
#include <xen/xenbus.h>
#include <xen/xennet_checksum.h>
#include <uvm/uvm.h>
/*
* Backend network device driver for Xen.
*/
#ifdef XENDEBUG_NET
#define XENPRINTF(x) printf x
#else
#define XENPRINTF(x)
#endif
#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
/*
* Number of packets to transmit in one hypercall (= number of pages to
* transmit at once).
*/
#define NB_XMIT_PAGES_BATCH 64
CTASSERT(NB_XMIT_PAGES_BATCH >= XEN_NETIF_NR_SLOTS_MIN);
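
/*
 * NB_XMIT_PAGES_BATCH sizes both the xni_gop_copy[] array of grant-copy
 * operations passed to the hypervisor in one GNTTABOP_copy call and the
 * xni_xstate[] bookkeeping array.  The CTASSERT above ensures that a
 * maximally fragmented frontend packet (XEN_NETIF_NR_SLOTS_MIN slots)
 * always fits in a single batch.
 */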
/* ratecheck(9) for pool allocation failures */
static const struct timeval xni_pool_errintvl = { 30, 0 }; /* 30s, each */
/* state of a xnetback instance */
typedef enum {
CONNECTED,
DISCONNECTING,
DISCONNECTED
} xnetback_state_t;
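
/*
 * Per-slot bookkeeping used while a batch of grant copies is built.
 * xs_m is only set on the slot that starts a packet; continuation
 * fragments of a multi-fragment request leave it NULL.
 */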
struct xnetback_xstate {
bus_dmamap_t xs_dmamap;
bool xs_loaded;
struct mbuf *xs_m;
struct netif_tx_request xs_tx;
uint16_t xs_tx_size; /* Size of data in this Tx fragment */
};
/* we keep the xnetback instances in a linked list */
struct xnetback_instance {
SLIST_ENTRY(xnetback_instance) next;
struct xenbus_device *xni_xbusd; /* our xenstore entry */
domid_t xni_domid; /* attached to this domain */
uint32_t xni_handle; /* domain-specific handle */
xnetback_state_t xni_status;
/* network interface stuff */
struct ethercom xni_ec;
struct callout xni_restart;
uint8_t xni_enaddr[ETHER_ADDR_LEN];
/* remote domain communication stuff */
unsigned int xni_evtchn; /* our event channel */
struct intrhand *xni_ih;
netif_tx_back_ring_t xni_txring;
netif_rx_back_ring_t xni_rxring;
grant_handle_t xni_tx_ring_handle; /* to unmap the ring */
grant_handle_t xni_rx_ring_handle;
vaddr_t xni_tx_ring_va; /* to unmap the ring */
vaddr_t xni_rx_ring_va;
/* arrays used in xennetback_ifstart(), used for both Rx and Tx */
gnttab_copy_t xni_gop_copy[NB_XMIT_PAGES_BATCH];
struct xnetback_xstate xni_xstate[NB_XMIT_PAGES_BATCH];
/* event counters */
struct evcnt xni_cnt_rx_cksum_blank;
struct evcnt xni_cnt_rx_cksum_undefer;
};
#define xni_if xni_ec.ec_if
#define xni_bpf xni_if.if_bpf
void xvifattach(int);
static int xennetback_ifioctl(struct ifnet *, u_long, void *);
static void xennetback_ifstart(struct ifnet *);
static void xennetback_ifsoftstart_copy(struct xnetback_instance *);
static void xennetback_ifwatchdog(struct ifnet *);
static int xennetback_ifinit(struct ifnet *);
static void xennetback_ifstop(struct ifnet *, int);
static int xennetback_xenbus_create(struct xenbus_device *);
static int xennetback_xenbus_destroy(void *);
static void xennetback_frontend_changed(void *, XenbusState);
static inline void xennetback_tx_response(struct xnetback_instance *,
int, int);
static SLIST_HEAD(, xnetback_instance) xnetback_instances;
static kmutex_t xnetback_lock;
static bool xnetif_lookup(domid_t, uint32_t);
static int xennetback_evthandler(void *);
static struct xenbus_backend_driver xvif_backend_driver = {
.xbakd_create = xennetback_xenbus_create,
.xbakd_type = "vif"
};
void
xvifattach(int n)
{
XENPRINTF(("xennetback_init\n"));
SLIST_INIT(&xnetback_instances);
mutex_init(&xnetback_lock, MUTEX_DEFAULT, IPL_NONE);
xenbus_backend_register(&xvif_backend_driver);
}
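
/*
 * Backend creation callback, invoked by the xenbus code when a new "vif"
 * backend node appears in xenstore: allocate and register the instance,
 * create the pseudo-interface, advertise our features and switch the
 * device to InitWait.
 */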
static int
xennetback_xenbus_create(struct xenbus_device *xbusd)
{
struct xnetback_instance *xneti;
long domid, handle;
struct ifnet *ifp;
extern int ifqmaxlen; /* XXX */
char *e, *p;
char mac[32];
int i, err;
struct xenbus_transaction *xbt;
if ((err = xenbus_read_ul(NULL, xbusd->xbusd_path,
"frontend-id", &domid, 10)) != 0) {
aprint_error("xvif: can't read %s/frontend-id: %d\n",
xbusd->xbusd_path, err);
return err;
}
if ((err = xenbus_read_ul(NULL, xbusd->xbusd_path,
"handle", &handle, 10)) != 0) {
aprint_error("xvif: can't read %s/handle: %d\n",
xbusd->xbusd_path, err);
return err;
}
xneti = kmem_zalloc(sizeof(*xneti), KM_SLEEP);
xneti->xni_domid = domid;
xneti->xni_handle = handle;
xneti->xni_status = DISCONNECTED;
/* Need to keep the lock for lookup and the list update */
mutex_enter(&xnetback_lock);
if (xnetif_lookup(domid, handle)) {
mutex_exit(&xnetback_lock);
kmem_free(xneti, sizeof(*xneti));
return EEXIST;
}
SLIST_INSERT_HEAD(&xnetback_instances, xneti, next);
mutex_exit(&xnetback_lock);
xbusd->xbusd_u.b.b_cookie = xneti;
xbusd->xbusd_u.b.b_detach = xennetback_xenbus_destroy;
xneti->xni_xbusd = xbusd;
ifp = &xneti->xni_if;
ifp->if_softc = xneti;
snprintf(ifp->if_xname, IFNAMSIZ, "xvif%di%d",
(int)domid, (int)handle);
/* read mac address */
err = xenbus_read(NULL, xbusd->xbusd_path, "mac", mac, sizeof(mac));
if (err) {
aprint_error_ifnet(ifp, "can't read %s/mac: %d\n",
xbusd->xbusd_path, err);
goto fail;
}
for (i = 0, p = mac; i < ETHER_ADDR_LEN; i++) {
xneti->xni_enaddr[i] = strtoul(p, &e, 16);
if ((e[0] == '\0' && i != 5) && e[0] != ':') {
aprint_error_ifnet(ifp,
"%s is not a valid mac address\n", mac);
err = EINVAL;
goto fail;
}
p = &e[1];
}
/* we can't use the same MAC addr as our guest */
xneti->xni_enaddr[3]++;
/* Initialize DMA map, used only for loading PA */
for (i = 0; i < __arraycount(xneti->xni_xstate); i++) {
if (bus_dmamap_create(xneti->xni_xbusd->xbusd_dmat,
ETHER_MAX_LEN_JUMBO, XEN_NETIF_NR_SLOTS_MIN,
PAGE_SIZE, PAGE_SIZE, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
&xneti->xni_xstate[i].xs_dmamap)
!= 0) {
aprint_error_ifnet(ifp,
"failed to allocate dma map\n");
err = ENOMEM;
goto fail;
}
}
evcnt_attach_dynamic(&xneti->xni_cnt_rx_cksum_blank, EVCNT_TYPE_MISC,
NULL, ifp->if_xname, "Rx csum blank");
evcnt_attach_dynamic(&xneti->xni_cnt_rx_cksum_undefer, EVCNT_TYPE_MISC,
NULL, ifp->if_xname, "Rx csum undeferred");
/* create pseudo-interface */
aprint_verbose_ifnet(ifp, "Ethernet address %s\n",
ether_sprintf(xneti->xni_enaddr));
xneti->xni_ec.ec_capabilities |= ETHERCAP_VLAN_MTU | ETHERCAP_JUMBO_MTU;
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_snd.ifq_maxlen =
uimax(ifqmaxlen, NET_TX_RING_SIZE * 2);
ifp->if_capabilities =
IFCAP_CSUM_UDPv4_Rx | IFCAP_CSUM_UDPv4_Tx
| IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_TCPv4_Tx
| IFCAP_CSUM_UDPv6_Rx | IFCAP_CSUM_UDPv6_Tx
| IFCAP_CSUM_TCPv6_Rx | IFCAP_CSUM_TCPv6_Tx;
#define XN_M_CSUM_SUPPORTED \
(M_CSUM_TCPv4 | M_CSUM_UDPv4 | M_CSUM_TCPv6 | M_CSUM_UDPv6)
ifp->if_ioctl = xennetback_ifioctl;
ifp->if_start = xennetback_ifstart;
ifp->if_watchdog = xennetback_ifwatchdog;
ifp->if_init = xennetback_ifinit;
ifp->if_stop = xennetback_ifstop;
ifp->if_timer = 0;
IFQ_SET_MAXLEN(&ifp->if_snd, uimax(2 * NET_TX_RING_SIZE, IFQ_MAXLEN));
IFQ_SET_READY(&ifp->if_snd);
if_attach(ifp);
if_deferred_start_init(ifp, NULL);
ether_ifattach(&xneti->xni_if, xneti->xni_enaddr);
xbusd->xbusd_otherend_changed = xennetback_frontend_changed;
do {
xbt = xenbus_transaction_start();
if (xbt == NULL) {
aprint_error_ifnet(ifp,
"%s: can't start transaction\n",
xbusd->xbusd_path);
goto fail;
}
err = xenbus_printf(xbt, xbusd->xbusd_path,
"vifname", "%s", ifp->if_xname);
if (err) {
aprint_error_ifnet(ifp,
"failed to write %s/vifname: %d\n",
xbusd->xbusd_path, err);
goto abort_xbt;
}
err = xenbus_printf(xbt, xbusd->xbusd_path,
"feature-rx-copy", "%d", 1);
if (err) {
aprint_error_ifnet(ifp,
"failed to write %s/feature-rx-copy: %d\n",
xbusd->xbusd_path, err);
goto abort_xbt;
}
err = xenbus_printf(xbt, xbusd->xbusd_path,
"feature-ipv6-csum-offload", "%d", 1);
if (err) {
aprint_error_ifnet(ifp,
"failed to write %s/feature-ipv6-csum-offload: %d\n",
xbusd->xbusd_path, err);
goto abort_xbt;
}
err = xenbus_printf(xbt, xbusd->xbusd_path,
"feature-sg", "%d", 1);
if (err) {
aprint_error_ifnet(ifp,
"failed to write %s/feature-sg: %d\n",
xbusd->xbusd_path, err);
goto abort_xbt;
}
} while ((err = xenbus_transaction_end(xbt, 0)) == EAGAIN);
if (err) {
aprint_error_ifnet(ifp,
"%s: can't end transaction: %d\n",
xbusd->xbusd_path, err);
}
err = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait);
if (err) {
aprint_error_ifnet(ifp,
"failed to switch state on %s: %d\n",
xbusd->xbusd_path, err);
goto fail;
}
return 0;
abort_xbt:
xenbus_transaction_end(xbt, 1);
fail:
kmem_free(xneti, sizeof(*xneti));
return err;
}
int
xennetback_xenbus_destroy(void *arg)
{
struct xnetback_instance *xneti = arg;
aprint_verbose_ifnet(&xneti->xni_if, "disconnecting\n");
if (xneti->xni_ih != NULL) {
hypervisor_mask_event(xneti->xni_evtchn);
xen_intr_disestablish(xneti->xni_ih);
xneti->xni_ih = NULL;
}
mutex_enter(&xnetback_lock);
SLIST_REMOVE(&xnetback_instances,
xneti, xnetback_instance, next);
mutex_exit(&xnetback_lock);
ether_ifdetach(&xneti->xni_if);
if_detach(&xneti->xni_if);
evcnt_detach(&xneti->xni_cnt_rx_cksum_blank);
evcnt_detach(&xneti->xni_cnt_rx_cksum_undefer);
/* Destroy DMA maps */
for (int i = 0; i < __arraycount(xneti->xni_xstate); i++) {
if (xneti->xni_xstate[i].xs_dmamap != NULL) {
bus_dmamap_destroy(xneti->xni_xbusd->xbusd_dmat,
xneti->xni_xstate[i].xs_dmamap);
xneti->xni_xstate[i].xs_dmamap = NULL;
}
}
if (xneti->xni_txring.sring) {
xen_shm_unmap(xneti->xni_tx_ring_va, 1,
&xneti->xni_tx_ring_handle);
}
if (xneti->xni_rxring.sring) {
xen_shm_unmap(xneti->xni_rx_ring_va, 1,
&xneti->xni_rx_ring_handle);
}
if (xneti->xni_tx_ring_va != 0) {
uvm_km_free(kernel_map, xneti->xni_tx_ring_va,
PAGE_SIZE, UVM_KMF_VAONLY);
xneti->xni_tx_ring_va = 0;
}
if (xneti->xni_rx_ring_va != 0) {
uvm_km_free(kernel_map, xneti->xni_rx_ring_va,
PAGE_SIZE, UVM_KMF_VAONLY);
xneti->xni_rx_ring_va = 0;
}
kmem_free(xneti, sizeof(*xneti));
return 0;
}
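
/*
 * Connect to the frontend: read the ring references and event channel it
 * published in xenstore, map the Tx and Rx shared rings, bind the
 * interdomain event channel and install the interrupt handler.
 */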
static int
xennetback_connect(struct xnetback_instance *xneti)
{
int err;
netif_tx_sring_t *tx_ring;
netif_rx_sring_t *rx_ring;
evtchn_op_t evop;
u_long tx_ring_ref, rx_ring_ref;
grant_ref_t gtx_ring_ref, grx_ring_ref;
u_long revtchn, rx_copy;
struct xenbus_device *xbusd = xneti->xni_xbusd;
/* read communication information */
err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
"tx-ring-ref", &tx_ring_ref, 10);
if (err) {
xenbus_dev_fatal(xbusd, err, "reading %s/tx-ring-ref",
xbusd->xbusd_otherend);
return -1;
}
err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
"rx-ring-ref", &rx_ring_ref, 10);
if (err) {
xenbus_dev_fatal(xbusd, err, "reading %s/rx-ring-ref",
xbusd->xbusd_otherend);
return -1;
}
err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
"event-channel", &revtchn, 10);
if (err) {
xenbus_dev_fatal(xbusd, err, "reading %s/event-channel",
xbusd->xbusd_otherend);
return -1;
}
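	/*
	 * Only the copying receive mode is supported: feature-rx-copy was
	 * advertised in xennetback_xenbus_create(), and a frontend that does
	 * not set request-rx-copy is rejected below.
	 */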
err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
"request-rx-copy", &rx_copy, 10);
if (err == ENOENT || !rx_copy) {
xenbus_dev_fatal(xbusd, err,
"%s/request-rx-copy not supported by frontend",
xbusd->xbusd_otherend);
return -1;
} else if (err) {
xenbus_dev_fatal(xbusd, err, "reading %s/request-rx-copy",
xbusd->xbusd_otherend);
return -1;
}
/* allocate VA space and map rings */
xneti->xni_tx_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
UVM_KMF_VAONLY);
if (xneti->xni_tx_ring_va == 0) {
xenbus_dev_fatal(xbusd, ENOMEM,
"can't get VA for TX ring", xbusd->xbusd_otherend);
goto err1;
}
tx_ring = (void *)xneti->xni_tx_ring_va;
xneti->xni_rx_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
UVM_KMF_VAONLY);
if (xneti->xni_rx_ring_va == 0) {
xenbus_dev_fatal(xbusd, ENOMEM,
"can't get VA for RX ring", xbusd->xbusd_otherend);
goto err1;
}
rx_ring = (void *)xneti->xni_rx_ring_va;
gtx_ring_ref = tx_ring_ref;
	if (xen_shm_map(1, xneti->xni_domid, &gtx_ring_ref,
xneti->xni_tx_ring_va, &xneti->xni_tx_ring_handle, 0) != 0) {
aprint_error_ifnet(&xneti->xni_if,
"can't map TX grant ref\n");
goto err2;
}
BACK_RING_INIT(&xneti->xni_txring, tx_ring, PAGE_SIZE);
grx_ring_ref = rx_ring_ref;
if (xen_shm_map(1, xneti->xni_domid, &grx_ring_ref,
xneti->xni_rx_ring_va, &xneti->xni_rx_ring_handle, 0) != 0) {
aprint_error_ifnet(&xneti->xni_if,
"can't map RX grant ref\n");
goto err2;
}
BACK_RING_INIT(&xneti->xni_rxring, rx_ring, PAGE_SIZE);
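	/*
	 * Both shared rings are now mapped through the grant references
	 * published by the frontend; the BACK_RING_INIT() calls set up our
	 * private producer/consumer views over them.
	 */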
evop.cmd = EVTCHNOP_bind_interdomain;
evop.u.bind_interdomain.remote_dom = xneti->xni_domid;
evop.u.bind_interdomain.remote_port = revtchn;
err = HYPERVISOR_event_channel_op(&evop);
if (err) {
aprint_error_ifnet(&xneti->xni_if,
"can't get event channel: %d\n", err);
goto err2;
}
xneti->xni_evtchn = evop.u.bind_interdomain.local_port;
xneti->xni_status = CONNECTED;
xneti->xni_ih = xen_intr_establish_xname(-1, &xen_pic,
xneti->xni_evtchn, IST_LEVEL, IPL_NET, xennetback_evthandler,
xneti, false, xneti->xni_if.if_xname);
KASSERT(xneti->xni_ih != NULL);
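	/*
	 * Bring the interface up (we are CONNECTED now, so xennetback_ifinit()
	 * marks it running), unmask the event channel so our handler can run,
	 * and send an initial notification to the frontend.
	 */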
xennetback_ifinit(&xneti->xni_if);
hypervisor_unmask_event(xneti->xni_evtchn);
hypervisor_notify_via_evtchn(xneti->xni_evtchn);
return 0;
err2:
/* unmap rings */
if (xneti->xni_tx_ring_handle != 0) {
xen_shm_unmap(xneti->xni_tx_ring_va, 1,
&xneti->xni_tx_ring_handle);
}
if (xneti->xni_rx_ring_handle != 0) {
xen_shm_unmap(xneti->xni_rx_ring_va, 1,
&xneti->xni_rx_ring_handle);
}
err1:
/* free rings VA space */
if (xneti->xni_rx_ring_va != 0)
uvm_km_free(kernel_map, xneti->xni_rx_ring_va,
PAGE_SIZE, UVM_KMF_VAONLY);
if (xneti->xni_tx_ring_va != 0)
uvm_km_free(kernel_map, xneti->xni_tx_ring_va,
PAGE_SIZE, UVM_KMF_VAONLY);
return -1;
}
static void
xennetback_frontend_changed(void *arg, XenbusState new_state)
{
struct xnetback_instance *xneti = arg;
struct xenbus_device *xbusd = xneti->xni_xbusd;
XENPRINTF(("%s: new state %d\n", xneti->xni_if.if_xname, new_state));
switch(new_state) {
case XenbusStateInitialising:
case XenbusStateInitialised:
break;
case XenbusStateConnected:
if (xneti->xni_status == CONNECTED)
break;
if (xennetback_connect(xneti) == 0)
xenbus_switch_state(xbusd, NULL, XenbusStateConnected);
break;
case XenbusStateClosing:
xneti->xni_status = DISCONNECTING;
xneti->xni_if.if_flags &= ~IFF_RUNNING;
xneti->xni_if.if_timer = 0;
xenbus_switch_state(xbusd, NULL, XenbusStateClosing);
break;
case XenbusStateClosed:
/* otherend_changed() should handle it for us */
panic("xennetback_frontend_changed: closed\n");
case XenbusStateUnknown:
case XenbusStateInitWait:
default:
aprint_error("%s: invalid frontend state %d\n",
xneti->xni_if.if_xname, new_state);
break;
}
return;
}
/* lookup a xneti based on domain id and interface handle */
static bool
xnetif_lookup(domid_t dom, uint32_t handle)
{
struct xnetback_instance *xneti;
bool found = false;
KASSERT(mutex_owned(&xnetback_lock));
SLIST_FOREACH(xneti, &xnetback_instances, next) {
if (xneti->xni_domid == dom && xneti->xni_handle == handle) {
found = true;
break;
}
}
return found;
}
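
/*
 * Queue a Tx response for the given request id and push it; the frontend
 * is only notified when RING_PUSH_RESPONSES_AND_CHECK_NOTIFY() reports
 * that an event is actually wanted.
 */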
static inline void
xennetback_tx_response(struct xnetback_instance *xneti, int id, int status)
{
RING_IDX resp_prod;
netif_tx_response_t *txresp;
int do_event;
resp_prod = xneti->xni_txring.rsp_prod_pvt;
txresp = RING_GET_RESPONSE(&xneti->xni_txring, resp_prod);
txresp->id = id;
txresp->status = status;
xneti->xni_txring.rsp_prod_pvt++;
RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xneti->xni_txring, do_event);
if (do_event) {
XENPRINTF(("%s send event\n", xneti->xni_if.if_xname));
hypervisor_notify_via_evtchn(xneti->xni_evtchn);
}
}
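
/*
 * Sanity checks on a Tx request: a request with NETTXF_more_data clear must
 * not cross a page boundary, and no request may claim more than a jumbo
 * frame.  Returns a description of the problem, or NULL if the request
 * looks sane.
 */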
static const char *
xennetback_tx_check_packet(const netif_tx_request_t *txreq)
{
if (__predict_false((txreq->flags & NETTXF_more_data) == 0 &&
txreq->offset + txreq->size > PAGE_SIZE))
return "crossing page boundary";
if (__predict_false(txreq->size > ETHER_MAX_LEN_JUMBO))
return "bigger then jumbo";
return NULL;
}
static int
xennetback_copy(struct ifnet *ifp, gnttab_copy_t *gop, int copycnt,
const char *dir)
{
/*
* Copy the data and ack it. Delaying it until the mbuf is
* freed will stall transmit.
*/
if (HYPERVISOR_grant_table_op(GNTTABOP_copy, gop, copycnt) != 0) {
printf("%s: GNTTABOP_copy %s failed", ifp->if_xname, dir);
return EINVAL;
}
	for (int i = 0; i < copycnt; i++) {
		if (gop[i].status != GNTST_okay) {
			printf("%s GNTTABOP_copy[%d] %s %d\n",
			    ifp->if_xname, i, dir, gop[i].status);
			return EINVAL;
		}
	}
return 0;
}
static void
xennetback_tx_copy_abort(struct ifnet *ifp, struct xnetback_instance *xneti,
int queued)
{
struct xnetback_xstate *xst;
for (int i = 0; i < queued; i++) {
xst = &xneti->xni_xstate[i];
if (xst->xs_loaded) {
KASSERT(xst->xs_m != NULL);
bus_dmamap_unload(xneti->xni_xbusd->xbusd_dmat,
xst->xs_dmamap);
xst->xs_loaded = false;
m_freem(xst->xs_m);
}
xennetback_tx_response(xneti, xst->xs_tx.id, NETIF_RSP_ERROR);
if_statinc(ifp, if_ierrors);
}
}
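
/*
 * Perform the queued Tx grant copies: walk the per-slot state, turn each
 * fragment into GNTTABOP_copy operations against the loaded DMA map
 * segments, flush in NB_XMIT_PAGES_BATCH-sized batches, then acknowledge
 * the requests and pass the completed mbufs to the network stack.
 */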
static void
xennetback_tx_copy_process(struct ifnet *ifp, struct xnetback_instance *xneti,
int queued)
{
gnttab_copy_t *gop;
struct xnetback_xstate *xst;
int copycnt = 0, seg = 0;
size_t goff = 0, segoff = 0, gsize, take;
bus_dmamap_t dm = NULL;
paddr_t ma;
for (int i = 0; i < queued; i++) {
xst = &xneti->xni_xstate[i];
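		/*
		 * A non-NULL xs_m starts a new packet: load its DMA map and
		 * restart the segment accounting.  Continuation fragments
		 * keep consuming the segments of the current map.
		 */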
if (xst->xs_m != NULL) {
KASSERT(xst->xs_m->m_pkthdr.len == xst->xs_tx.size);
if (__predict_false(bus_dmamap_load_mbuf(
xneti->xni_xbusd->xbusd_dmat,
xst->xs_dmamap, xst->xs_m, BUS_DMA_NOWAIT) != 0))
goto abort;
xst->xs_loaded = true;
dm = xst->xs_dmamap;
seg = 0;
goff = segoff = 0;
}
gsize = xst->xs_tx_size;
goff = 0;
for (; seg < dm->dm_nsegs && gsize > 0; seg++) {
bus_dma_segment_t *ds = &dm->dm_segs[seg];
ma = ds->ds_addr;
take = uimin(gsize, ds->ds_len);
KASSERT(copycnt <= NB_XMIT_PAGES_BATCH);
if (copycnt == NB_XMIT_PAGES_BATCH) {
if (xennetback_copy(ifp, xneti->xni_gop_copy,
copycnt, "Tx") != 0)
goto abort;
copycnt = 0;
}
/* Queue for the copy */
gop = &xneti->xni_gop_copy[copycnt++];
memset(gop, 0, sizeof(*gop));
gop->flags = GNTCOPY_source_gref;
gop->len = take;
gop->source.u.ref = xst->xs_tx.gref;
gop->source.offset = xst->xs_tx.offset + goff;
gop->source.domid = xneti->xni_domid;
gop->dest.offset = (ma & PAGE_MASK) + segoff;
KASSERT(gop->dest.offset <= PAGE_SIZE);
gop->dest.domid = DOMID_SELF;
gop->dest.u.gmfn = ma >> PAGE_SHIFT;
goff += take;
gsize -= take;
if (take + segoff < ds->ds_len) {
segoff += take;
/* Segment not completely consumed yet */
break;
}
segoff = 0;
}
KASSERT(gsize == 0);
KASSERT(goff == xst->xs_tx_size);
}
if (copycnt > 0) {
if (xennetback_copy(ifp, xneti->xni_gop_copy, copycnt, "Tx"))
goto abort;
copycnt = 0;
}
/* If we got here, the whole copy was successful */
for (int i = 0; i < queued; i++) {
xst = &xneti->xni_xstate[i];
xennetback_tx_response(xneti, xst->xs_tx.id, NETIF_RSP_OKAY);
if (xst->xs_m != NULL) {
KASSERT(xst->xs_loaded);
bus_dmamap_unload(xneti->xni_xbusd->xbusd_dmat,
xst->xs_dmamap);
if (xst->xs_tx.flags & NETTXF_csum_blank) {
xennet_checksum_fill(ifp, xst->xs_m,
&xneti->xni_cnt_rx_cksum_blank,
&xneti->xni_cnt_rx_cksum_undefer);
} else if (xst->xs_tx.flags & NETTXF_data_validated) {
xst->xs_m->m_pkthdr.csum_flags =
XN_M_CSUM_SUPPORTED;
}
m_set_rcvif(xst->xs_m, ifp);
if_percpuq_enqueue(ifp->if_percpuq, xst->xs_m);
}
}
return;
abort:
xennetback_tx_copy_abort(ifp, xneti, queued);
}
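
/*
 * The first request of a multi-fragment packet carries the total packet
 * length; walk the rest of the chain to compute the length of that first
 * fragment.  Returns the first fragment's length and stores the number of
 * fragments in *cntp.
 */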
static int
xennetback_tx_m0len_fragment(struct xnetback_instance *xneti,
int m0_len, int req_cons, int *cntp)
{
netif_tx_request_t *txreq;
/* This assumes all the requests are already pushed into the ring */
*cntp = 1;
do {
txreq = RING_GET_REQUEST(&xneti->xni_txring, req_cons);
KASSERT(m0_len > txreq->size);
m0_len -= txreq->size;
req_cons++;
(*cntp)++;
} while (txreq->flags & NETTXF_more_data);
return m0_len;
}
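
/*
 * Event channel handler; this is the guest-transmit path.  Consume the Tx
 * requests posted by the frontend, assemble them into mbuf chains
 * (coalescing multi-fragment requests) and hand the batches to
 * xennetback_tx_copy_process(), which does the actual grant copies.
 */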
static int
xennetback_evthandler(void *arg)
{
struct xnetback_instance *xneti = arg;
struct ifnet *ifp = &xneti->xni_if;
netif_tx_request_t txreq;
struct mbuf *m, *m0 = NULL, *mlast = NULL;
int receive_pending;
RING_IDX req_cons, req_prod;
int queued = 0, m0_len = 0;
struct xnetback_xstate *xst;
const bool discard = ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) !=
(IFF_UP | IFF_RUNNING));
XENPRINTF(("xennetback_evthandler "));
req_cons = xneti->xni_txring.req_cons;
again:
req_prod = xneti->xni_txring.sring->req_prod;
xen_rmb();
while (req_cons != req_prod) {
RING_COPY_REQUEST(&xneti->xni_txring, req_cons,
&txreq);
XENPRINTF(("%s pkt size %d\n", xneti->xni_if.if_xname,
txreq.size));
req_cons++;
if (__predict_false(discard)) {
/* interface not up, drop all requests */
if_statinc(ifp, if_iqdrops);
xennetback_tx_response(xneti, txreq.id,
NETIF_RSP_DROPPED);
continue;
}
/*
		 * Do some sanity checks, and queue a copy of the data.
*/
const char *msg = xennetback_tx_check_packet(&txreq);
if (__predict_false(msg != NULL)) {
printf("%s: packet with size %d is %s\n",
ifp->if_xname, txreq.size, msg);
xennetback_tx_response(xneti, txreq.id,
NETIF_RSP_ERROR);
if_statinc(ifp, if_ierrors);
continue;
}
/* get a mbuf for this fragment */
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (__predict_false(m == NULL)) {
static struct timeval lasttime;
mbuf_fail:
if (ratecheck(&lasttime, &xni_pool_errintvl))
printf("%s: mbuf alloc failed\n",
ifp->if_xname);
xennetback_tx_copy_abort(ifp, xneti, queued);
queued = 0;
m0 = NULL;
xennetback_tx_response(xneti, txreq.id,
NETIF_RSP_DROPPED);
if_statinc(ifp, if_ierrors);
continue;
}
m->m_len = m->m_pkthdr.len = txreq.size;
if (!m0 && (txreq.flags & NETTXF_more_data)) {
/*
			 * The first fragment of a multi-fragment Tx request
			 * contains the total packet size.  We need to walk the
			 * whole request chain to determine the actual size of
			 * this first (i.e. current) fragment.
*/
int cnt;
m0_len = xennetback_tx_m0len_fragment(xneti,
txreq.size, req_cons, &cnt);
m->m_len = m0_len;
KASSERT(cnt <= XEN_NETIF_NR_SLOTS_MIN);
if (queued + cnt >= NB_XMIT_PAGES_BATCH) {
/*
* Flush queue if too full to fit this
* new packet whole.
*/
KASSERT(m0 == NULL);
xennetback_tx_copy_process(ifp, xneti, queued);
queued = 0;
}
}
if (m->m_len > MHLEN) {
MCLGET(m, M_DONTWAIT);
if (__predict_false((m->m_flags & M_EXT) == 0)) {
m_freem(m);
goto mbuf_fail;
}
if (__predict_false(m->m_len > MCLBYTES)) {
/* one more mbuf necessary */
struct mbuf *mn;
MGET(mn, M_DONTWAIT, MT_DATA);
if (__predict_false(mn == NULL)) {
m_freem(m);
goto mbuf_fail;
}
if (m->m_len - MCLBYTES > MLEN) {
MCLGET(mn, M_DONTWAIT);
if ((mn->m_flags & M_EXT) == 0) {
m_freem(mn);
m_freem(m);
goto mbuf_fail;
}
}
mn->m_len = m->m_len - MCLBYTES;
m->m_len = MCLBYTES;
m->m_next = mn;
KASSERT(mn->m_len <= MCLBYTES);
}
KASSERT(m->m_len <= MCLBYTES);
}
if (m0 || (txreq.flags & NETTXF_more_data)) {
if (m0 == NULL) {
m0 = m;
mlast = (m->m_next) ? m->m_next : m;
KASSERT(mlast->m_next == NULL);
} else {
/* Coalesce like m_cat(), but without copy */
KASSERT(mlast != NULL);
if (M_TRAILINGSPACE(mlast) >= m->m_pkthdr.len) {
mlast->m_len += m->m_pkthdr.len;
m_freem(m);
} else {
mlast->m_next = m;
mlast = (m->m_next) ? m->m_next : m;
KASSERT(mlast->m_next == NULL);
}
}
}
XENPRINTF(("%s pkt offset %d size %d id %d req_cons %d\n",
xneti->xni_if.if_xname, txreq.offset,
txreq.size, txreq.id, req_cons & (RING_SIZE(&xneti->xni_txring) - 1)));
xst = &xneti->xni_xstate[queued];
xst->xs_m = (m0 == NULL || m == m0) ? m : NULL;
xst->xs_tx = txreq;
/* Fill the length of _this_ fragment */
xst->xs_tx_size = (m == m0) ? m0_len : m->m_pkthdr.len;
queued++;
KASSERT(queued <= NB_XMIT_PAGES_BATCH);
if (__predict_false(m0 &&
(txreq.flags & NETTXF_more_data) == 0)) {
/* Last fragment, stop appending mbufs */
m0 = NULL;
}
if (queued == NB_XMIT_PAGES_BATCH) {
KASSERT(m0 == NULL);
xennetback_tx_copy_process(ifp, xneti, queued);
queued = 0;
}
}
xen_wmb();
RING_FINAL_CHECK_FOR_REQUESTS(&xneti->xni_txring, receive_pending);
if (receive_pending)
goto again;
xneti->xni_txring.req_cons = req_cons;
if (m0) {
/* Queue empty, and still unfinished multi-fragment request */
printf("%s: dropped unfinished multi-fragment\n",
ifp->if_xname);
xennetback_tx_copy_abort(ifp, xneti, queued);
queued = 0;
m0 = NULL;
}
if (queued > 0)
xennetback_tx_copy_process(ifp, xneti, queued);
/* check to see if we can transmit more packets */
if_schedule_deferred_start(ifp);
return 1;
}
static int
xennetback_ifioctl(struct ifnet *ifp, u_long cmd, void *data)
{
//struct xnetback_instance *xneti = ifp->if_softc;
//struct ifreq *ifr = (struct ifreq *)data;
int s, error;
s = splnet();
error = ether_ioctl(ifp, cmd, data);
if (error == ENETRESET)
error = 0;
splx(s);
return error;
}
static void
xennetback_ifstart(struct ifnet *ifp)
{
struct xnetback_instance *xneti = ifp->if_softc;
/*
	 * The Xen communication channel is much more efficient if we can
	 * schedule a batch of packets for the domain.  Deferred start by the
	 * network stack will enqueue all pending mbufs in the interface's send
	 * queue before the queue is processed by the soft interrupt handler.
*/
xennetback_ifsoftstart_copy(xneti);
}
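
/*
 * Guest-receive completion: check the status of the queued grant copies,
 * advance the Rx ring pointers, push the responses (notifying the frontend
 * if needed) and free the mbufs whose data has been copied out.
 */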
static void
xennetback_rx_copy_process(struct ifnet *ifp, struct xnetback_instance *xneti,
int queued, int copycnt)
{
int notify;
struct xnetback_xstate *xst;
if (xennetback_copy(ifp, xneti->xni_gop_copy, copycnt, "Rx") != 0) {
/* message already displayed */
goto free_mbufs;
}
/* update pointer */
xneti->xni_rxring.req_cons += queued;
xneti->xni_rxring.rsp_prod_pvt += queued;
RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xneti->xni_rxring, notify);
/* send event */
if (notify) {
xen_wmb();
XENPRINTF(("%s receive event\n",
xneti->xni_if.if_xname));
hypervisor_notify_via_evtchn(xneti->xni_evtchn);
}
free_mbufs:
/* now that data was copied we can free the mbufs */
for (int j = 0; j < queued; j++) {
xst = &xneti->xni_xstate[j];
if (xst->xs_loaded) {
bus_dmamap_unload(xneti->xni_xbusd->xbusd_dmat,
xst->xs_dmamap);
xst->xs_loaded = false;
}
if (xst->xs_m != NULL) {
m_freem(xst->xs_m);
xst->xs_m = NULL;
}
}
}
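
/*
 * Split one DMA-mapped mbuf across one or more frontend Rx requests:
 * queue a GNTCOPY_dest_gref copy operation for each (segment, page) chunk
 * and fill in the matching Rx responses.  Updates *queued and *copycntp
 * with the number of ring slots and copy operations consumed.
 */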
static void
xennetback_rx_copy_queue(struct xnetback_instance *xneti,
struct xnetback_xstate *xst0, int rsp_prod_pvt, int *queued, int *copycntp)
{
struct xnetback_xstate *xst = xst0;
gnttab_copy_t *gop;
struct netif_rx_request rxreq;
netif_rx_response_t *rxresp;
paddr_t ma;
size_t goff, segoff, segsize, take, totsize;
int copycnt = *copycntp, reqcnt = *queued;
const bus_dmamap_t dm = xst0->xs_dmamap;
const bool multiseg = (dm->dm_nsegs > 1);
KASSERT(xst0 == &xneti->xni_xstate[reqcnt]);
RING_COPY_REQUEST(&xneti->xni_rxring,
xneti->xni_rxring.req_cons + reqcnt, &rxreq);
goff = 0;
rxresp = RING_GET_RESPONSE(&xneti->xni_rxring, rsp_prod_pvt + reqcnt);
reqcnt++;
rxresp->id = rxreq.id;
rxresp->offset = 0;
if ((xst0->xs_m->m_pkthdr.csum_flags & XN_M_CSUM_SUPPORTED) != 0) {
rxresp->flags = NETRXF_csum_blank;
} else {
rxresp->flags = NETRXF_data_validated;
}
if (multiseg)
rxresp->flags |= NETRXF_more_data;
totsize = xst0->xs_m->m_pkthdr.len;
/*
* Arrange for the mbuf contents to be copied into one or more
* provided memory pages.
*/
for (int seg = 0; seg < dm->dm_nsegs; seg++) {
ma = dm->dm_segs[seg].ds_addr;
segsize = dm->dm_segs[seg].ds_len;
segoff = 0;
while (segoff < segsize) {
take = uimin(PAGE_SIZE - goff, segsize - segoff);
KASSERT(take <= totsize);
/* add copy request */
gop = &xneti->xni_gop_copy[copycnt++];
gop->flags = GNTCOPY_dest_gref;
gop->source.offset = (ma & PAGE_MASK) + segoff;
gop->source.domid = DOMID_SELF;
gop->source.u.gmfn = ma >> PAGE_SHIFT;
gop->dest.u.ref = rxreq.gref;
gop->dest.offset = goff;
gop->dest.domid = xneti->xni_domid;
gop->len = take;
segoff += take;
goff += take;
totsize -= take;
if (goff == PAGE_SIZE && totsize > 0) {
rxresp->status = goff;
/* Take next grant */
RING_COPY_REQUEST(&xneti->xni_rxring,
xneti->xni_rxring.req_cons + reqcnt,
&rxreq);
goff = 0;
rxresp = RING_GET_RESPONSE(&xneti->xni_rxring,
rsp_prod_pvt + reqcnt);
reqcnt++;
rxresp->id = rxreq.id;
rxresp->offset = 0;
rxresp->flags = NETRXF_more_data;
xst++;
xst->xs_m = NULL;
}
}
}
rxresp->flags &= ~NETRXF_more_data;
rxresp->status = goff;
KASSERT(totsize == 0);
KASSERT(copycnt > *copycntp);
KASSERT(reqcnt > *queued);
*copycntp = copycnt;
*queued = reqcnt;
}
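
/*
 * Guest-receive main loop: dequeue mbufs from the send queue, load their
 * DMA maps (defragmenting when needed), queue grant copies until the batch
 * or the Rx ring is full, then flush the batch with
 * xennetback_rx_copy_process().
 */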
static void
xennetback_ifsoftstart_copy(struct xnetback_instance *xneti)
{
struct ifnet *ifp = &xneti->xni_if;
struct mbuf *m;
int queued = 0;
RING_IDX req_prod, rsp_prod_pvt;
struct xnetback_xstate *xst;
int copycnt = 0;
bool abort;
XENPRINTF(("xennetback_ifsoftstart_copy "));
int s = splnet();
if (__predict_false((ifp->if_flags & IFF_RUNNING) == 0)) {
splx(s);
return;
}
while (!IFQ_IS_EMPTY(&ifp->if_snd)) {
XENPRINTF(("pkt\n"));
req_prod = xneti->xni_rxring.sring->req_prod;
rsp_prod_pvt = xneti->xni_rxring.rsp_prod_pvt;
xen_rmb();
abort = false;
KASSERT(queued == 0);
KASSERT(copycnt == 0);
while (copycnt < NB_XMIT_PAGES_BATCH) {
#define XN_RING_FULL(cnt) \
req_prod == xneti->xni_rxring.req_cons + (cnt) || \
xneti->xni_rxring.req_cons - (rsp_prod_pvt + cnt) == \
NET_RX_RING_SIZE
if (__predict_false(XN_RING_FULL(1))) {
/* out of ring space */
XENPRINTF(("xennetback_ifstart: ring full "
"req_prod 0x%x req_cons 0x%x rsp_prod_pvt "
"0x%x\n",
req_prod,
xneti->xni_rxring.req_cons + queued,
rsp_prod_pvt + queued));
abort = true;
break;
}
IFQ_DEQUEUE(&ifp->if_snd, m);
if (m == NULL)
break;
again:
xst = &xneti->xni_xstate[queued];
/*
			 * For short packets it's always much faster to pass a
			 * single defragmented packet, even with feature-sg.
			 * Try to defragment first if the result is likely
			 * to fit into a single mbuf.
*/
if (m->m_pkthdr.len < MCLBYTES && m->m_next)
(void)m_defrag(m, M_DONTWAIT);
if (bus_dmamap_load_mbuf(
xneti->xni_xbusd->xbusd_dmat,
xst->xs_dmamap, m, BUS_DMA_NOWAIT) != 0) {
if (m_defrag(m, M_DONTWAIT) == NULL) {
m_freem(m);
static struct timeval lasttime;
if (ratecheck(&lasttime, &xni_pool_errintvl))
printf("%s: fail defrag mbuf\n",
ifp->if_xname);
continue;
}
if (__predict_false(bus_dmamap_load_mbuf(
xneti->xni_xbusd->xbusd_dmat,
xst->xs_dmamap, m, BUS_DMA_NOWAIT) != 0)) {
printf("%s: cannot load mbuf\n",
ifp->if_xname);
m_freem(m);
continue;
}
}
KASSERT(xst->xs_dmamap->dm_nsegs < NB_XMIT_PAGES_BATCH);
KASSERTMSG(queued <= copycnt, "queued %d > copycnt %d",
queued, copycnt);
if (__predict_false(XN_RING_FULL(
xst->xs_dmamap->dm_nsegs))) {
/* Ring too full to fit the packet */
bus_dmamap_unload(xneti->xni_xbusd->xbusd_dmat,
xst->xs_dmamap);
m_freem(m);
abort = true;
break;
}
if (__predict_false(copycnt + xst->xs_dmamap->dm_nsegs >
NB_XMIT_PAGES_BATCH)) {
/* Batch already too full, flush and retry */
bus_dmamap_unload(xneti->xni_xbusd->xbusd_dmat,
xst->xs_dmamap);
xennetback_rx_copy_process(ifp, xneti, queued,
copycnt);
queued = copycnt = 0;
goto again;
}
/* Now committed to send */
xst->xs_loaded = true;
xst->xs_m = m;
xennetback_rx_copy_queue(xneti, xst,
			    rsp_prod_pvt, &queued, &copycnt);
if_statinc(ifp, if_opackets);
bpf_mtap(ifp, m, BPF_D_OUT);
}
KASSERT(copycnt <= NB_XMIT_PAGES_BATCH);
KASSERT(queued <= copycnt);
if (copycnt > 0) {
xennetback_rx_copy_process(ifp, xneti, queued, copycnt);
queued = copycnt = 0;
}
/*
* note that we don't use RING_FINAL_CHECK_FOR_REQUESTS()
* here, as the frontend doesn't notify when adding
* requests anyway
*/
if (__predict_false(abort ||
!RING_HAS_UNCONSUMED_REQUESTS(&xneti->xni_rxring))) {
/* ring full */
ifp->if_timer = 1;
break;
}
}
splx(s);
}
static void
xennetback_ifwatchdog(struct ifnet *ifp)
{
/*
	 * We can get into the following condition: transmit stalls because
	 * the ring is full while the ifq is full too.
*
* In this case (as, unfortunately, we don't get an interrupt from xen
* on transmit) nothing will ever call xennetback_ifstart() again.
* Here we abuse the watchdog to get out of this condition.
*/
XENPRINTF(("xennetback_ifwatchdog\n"));
xennetback_ifstart(ifp);
}
static int
xennetback_ifinit(struct ifnet *ifp)
{
struct xnetback_instance *xneti = ifp->if_softc;
int s = splnet();
if ((ifp->if_flags & IFF_UP) == 0) {
splx(s);
return 0;
}
if (xneti->xni_status == CONNECTED)
ifp->if_flags |= IFF_RUNNING;
splx(s);
return 0;
}
static void
xennetback_ifstop(struct ifnet *ifp, int disable)
{
struct xnetback_instance *xneti = ifp->if_softc;
int s = splnet();
ifp->if_flags &= ~IFF_RUNNING;
ifp->if_timer = 0;
if (xneti->xni_status == CONNECTED) {
xennetback_evthandler(ifp->if_softc); /* flush pending RX requests */
}
splx(s);
}