Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add (G)ARP sender to fix switch broadcast storms #413

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
178 changes: 170 additions & 8 deletions apps/pkt-gen/pkt-gen.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,12 @@
#endif
#include <ifaddrs.h> /* getifaddrs */
#include <net/ethernet.h>
#include <net/if_arp.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
#include <netinet/ip6.h>
#include <netinet/if_ether.h>
#ifdef linux
#define IPV6_VERSION 0x60
#define IPV6_DEFHLIM 64
Expand Down Expand Up @@ -142,6 +144,7 @@ ether_ntoa(const struct ether_addr *n)
#define IFF_PPROMISC IFF_PROMISC /* IFF_PPROMISC does not exist */
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <linux/if.h>

#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
#include <netinet/ether.h> /* ether_aton */
Expand Down Expand Up @@ -211,6 +214,7 @@ struct pkt {
struct udphdr udp;
uint8_t body[MAX_BODYSIZE]; /* hardwired */
} ipv6;
struct ether_arp arp4;
};
} __attribute__((__packed__));

Expand Down Expand Up @@ -295,6 +299,7 @@ struct glob_arg {
int td_type;
void *mmap_addr;
char ifname[MAX_IFNAMELEN];
char phyname[MAX_IFNAMELEN];
char *nmr_config;
int dummy_send;
int virt_header; /* send also the virt_header */
Expand All @@ -305,6 +310,7 @@ struct glob_arg {
int win_idx;
int64_t win[STATS_WIN];
int wait_link;
int garp;
};
enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };

Expand All @@ -329,6 +335,7 @@ struct targ {
int me;
pthread_t thread;
int affinity;
int garp;

struct pkt pkt;
void *frame;
Expand Down Expand Up @@ -654,6 +661,42 @@ source_hwaddr(const char *ifname, char *buf)
}


/*
* getifaddrs() is the easiest way to get full link state (on Linux anyway),
* because ifa_flags includes IFF_LOWER_UP. Contrary to netdevice(7) docs,
* the value is too large to fit into ifr_flags from ioctl(SIOCGIFFLAGS).
*/
#ifndef IFF_LOWER_UP
#define IFF_LOWER_UP 0
#endif

/*
 * Report whether the link of interface `ifname` is fully up.
 *
 * Walks the getifaddrs() list looking for the link-layer (AF_LINK) entry
 * whose name matches `ifname`, then tests its flags.
 *
 * Returns 1 when IFF_UP, IFF_RUNNING and IFF_LOWER_UP are all set,
 * 0 when any of them is missing or the interface is not found,
 * and -1 when getifaddrs() itself fails.
 */
static int
check_link_up(const char *ifname)
{
	const int wanted = IFF_UP | IFF_RUNNING | IFF_LOWER_UP;
	struct ifaddrs *list, *cur;
	int have = 0;

	if (getifaddrs(&list) != 0) {
		D("getifaddrs %s failed", ifname);
		return (-1);
	}

	cur = list;
	while (cur != NULL) {
		struct sockaddr_dl *lladdr =
		    (struct sockaddr_dl *)cur->ifa_addr;

		/* only the link-layer entry of the requested interface counts */
		if (lladdr != NULL && lladdr->sdl_family == AF_LINK &&
		    strncmp(cur->ifa_name, ifname, IFNAMSIZ) == 0) {
			have = cur->ifa_flags & wanted;
			break;
		}
		cur = cur->ifa_next;
	}
	freeifaddrs(list);

	return (have == wanted) ? 1 : 0;
}


/* set the thread affinity. */
static int
setaffinity(pthread_t me, int i)
Expand Down Expand Up @@ -1115,7 +1158,7 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
int size, struct glob_arg *g, u_int count, int options,
u_int nfrags)
{
u_int n, sent, cur = ring->cur;
u_int n, sent, vh = g->virt_header, cur = ring->cur;
u_int fcnt;

n = nm_ring_space(ring);
Expand All @@ -1140,15 +1183,19 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
for (fcnt = nfrags, sent = 0; sent < count; sent++) {
struct netmap_slot *slot = &ring->slot[cur];
char *p = NETMAP_BUF(ring, slot->buf_idx);
int buf_changed = slot->flags & NS_BUF_CHANGED;
struct pkt *old = (struct pkt*)(p - sizeof(old->vh) + vh);
int copy = options & OPT_COPY || slot->flags & NS_BUF_CHANGED;
/* sender_body() drops OPT_COPY after starting, but we need to
* copy over any ARP packets lingering in the txring */
copy |= (old->eh.ether_type == htons(ETHERTYPE_ARP));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is likely a cache miss, I would do that only if -G is specified

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's actually worse than it looks... Only thread 0 can have (targ->garp != 0) but here all TX threads are checking for no reason. Let me improve this.

(My initial implementation used the "sem[]" area in the netmap_ring to store a dirty flag, for when the ring needs to be checked. It works fine but it's fiddly and I don't think people would like it.)


slot->flags = 0;
if (options & OPT_RUBBISH) {
/* do nothing */
} else if (options & OPT_INDIRECT) {
slot->flags |= NS_INDIRECT;
slot->ptr = (uint64_t)((uintptr_t)frame);
} else if ((options & OPT_COPY) || buf_changed) {
} else if (copy) {
nm_pkt_copy(frame, p, size);
if (fcnt == nfrags)
update_addresses(pkt, g);
Expand Down Expand Up @@ -1177,6 +1224,77 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
return (sent);
}

/*
 * Queue two ARP packets on the TX rings owned by this thread: a gratuitous
 * ARP to advertise the netmap device, and an ARP request directed at the
 * remote party.
 *
 * Many switches require regular ARP traffic to keep the [mac:ip] association
 * to the physical switch port. When the association is lost the switch will
 * broadcast your pkt-gen traffic to *all* ports, killing network performance.
 * This often happens during PHY reset, when switching into netmap mode.
 *
 * Does nothing unless targ->garp is set. The packets are only placed in the
 * ring (head/cur are advanced); no TX sync is issued here. If the rings have
 * fewer than two free slots, fewer than two packets are queued.
 */
static void
send_arp(struct targ *targ)
{
	struct netmap_if *nifp = targ->nmd->nifp;
	struct ether_addr *src_mac = &targ->g->src_mac.start;
	uint32_t src_ip = htonl(targ->g->src_ip.ipv4.start);
	uint32_t dst_ip = htonl(targ->g->dst_ip.ipv4.start);
	struct pkt *pkt = NULL;
	int virt = targ->g->virt_header;
	int size = sizeof(pkt->eh) + sizeof(pkt->arp4) + virt;
	int need = 2;
	int i;

	if (!targ->garp)
		return;

	for (i = targ->nmd->first_tx_ring;
	     i <= targ->nmd->last_tx_ring && need; i++) {
		struct netmap_ring *ring = NETMAP_TXRING(nifp, i);

		/*
		 * Keep filling this ring while it has room, so that both
		 * packets go out even when the thread owns a single ring.
		 */
		while (need && !nm_ring_empty(ring)) {
			struct netmap_slot *slot = &ring->slot[ring->cur];
			char *frame = NETMAP_BUF(ring, slot->buf_idx);
			struct ether_header *eh;
			struct ether_arp *arp;
			struct arphdr *hdr;

			slot->len = size;
			slot->ptr = 0;
			/* drop stale per-slot flags (e.g. NS_INDIRECT) left
			 * by earlier transmissions: the data is in the buffer */
			slot->flags = 0;

			/* struct pkt starts with the full-size virt header;
			 * shift so that eh lands right after `virt` bytes */
			pkt = (struct pkt *)(frame - sizeof(pkt->vh) + virt);
			bzero(frame, virt);

			eh = &pkt->eh;
			arp = &pkt->arp4;
			hdr = &arp->ea_hdr;

			/* fields shared by both packets */
			eh->ether_type = htons(ETHERTYPE_ARP);
			memset(eh->ether_dhost, 0xff, ETHER_ADDR_LEN);
			bcopy(src_mac, eh->ether_shost, ETHER_ADDR_LEN);
			hdr->ar_hrd = htons(ARPHRD_ETHER);
			hdr->ar_pro = htons(ETHERTYPE_IP);
			hdr->ar_hln = ETHER_ADDR_LEN;
			hdr->ar_pln = sizeof(src_ip);
			bcopy(src_mac, arp->arp_sha, ETHER_ADDR_LEN);
			bcopy(&src_ip, arp->arp_spa, sizeof(src_ip));

			if (need == 2) {
				/* first a GARP: announce our own address,
				 * so sender and target IP are both src_ip */
				hdr->ar_op = htons(ARPOP_REPLY);
				bcopy(src_mac, arp->arp_tha, ETHER_ADDR_LEN);
				bcopy(&src_ip, arp->arp_tpa, sizeof(src_ip));
			} else {
				/* then an ARP request for the remote party */
				hdr->ar_op = htons(ARPOP_REQUEST);
				bzero(arp->arp_tha, ETHER_ADDR_LEN);
				bcopy(&dst_ip, arp->arp_tpa, sizeof(dst_ip));
			}

			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
			need--;
		}
	}
}


/*
* Index of the highest bit set
*/
Expand Down Expand Up @@ -1561,6 +1679,8 @@ sender_body(void *data)
} else {
int tosend = 0;
int frags = targ->g->frags;
struct timespec arptime = targ->tic;
struct timespec tmptime = { 0, 0};

nifp = targ->nmd->nifp;
while (!targ->cancel && (n == 0 || sent < n)) {
Expand All @@ -1570,6 +1690,13 @@ sender_body(void *data)
nexttime = timespec_add(nexttime, targ->g->tx_period);
wait_time(nexttime);
}
if (targ->garp) {
clock_gettime(CLOCK_REALTIME_PRECISE, &tmptime);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need to be CLOCK_REALTIME_PRECISE? COARSE will be enough, and it's faster.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, the Linux manpage says COARSE is Linux-specific. I'm not sure which other supported platforms will need a work-around. How about gettimeofday?

if (tmptime.tv_sec - arptime.tv_sec >= 1) {
arptime = tmptime;
send_arp(targ);
}
}

/*
* wait for available room in the send queue(s)
Expand Down Expand Up @@ -1742,6 +1869,9 @@ receiver_body(void *data)
D("fd error");
goto quit;
}
if (targ->garp)
send_arp(targ);

RD(1, "waiting for initial packets, poll returns %d %d",
i, pfd.revents);
}
Expand Down Expand Up @@ -1769,6 +1899,9 @@ receiver_body(void *data)
#endif /* !NO_PCAP */
} else {
int dump = targ->g->options & OPT_DUMP;
struct timespec arptime = { 0, 0};
struct timespec tmptime = { 0, 0};
clock_gettime(CLOCK_REALTIME_PRECISE, &arptime);

nifp = targ->nmd->nifp;
while (!targ->cancel) {
Expand All @@ -1792,6 +1925,14 @@ receiver_body(void *data)
goto quit;
}
#endif /* !BUSYWAIT */
if (targ->garp) {
clock_gettime(CLOCK_REALTIME_PRECISE, &tmptime);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm just going along with what was already being used.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I had not seen that the _PRECISE is just a macro that maps back to CLOCK_REALTIME, which is ok.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aha, you just beat me to saying that

if (tmptime.tv_sec - arptime.tv_sec >= 1) {
arptime = tmptime;
send_arp(targ);
}
}

uint64_t cur_space = 0;
for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
int m;
Expand Down Expand Up @@ -2287,7 +2428,7 @@ usage(int errcode)
"\t-c cores cores to use\n"
"\t-p threads processes/threads to use\n"
"\t-T report_ms milliseconds between reports\n"
"\t-w wait_for_link_time in seconds\n"
"\t-w wait_for_link_time in seconds (default 2) (0 for auto)\n"
"\t-R rate in packets per second\n"
"\t-X dump payload\n"
"\t-H len add empty virtio-net-header with size 'len'\n"
Expand Down Expand Up @@ -2315,6 +2456,7 @@ usage(int errcode)
"\t OPT_RANDOM_SRC 512\n"
"\t OPT_RANDOM_DST 1024\n"
"\t OPT_PPS_STATS 2048\n"
"\t-G send (G)ARP announcements\n"
"\t-W exit RX with no traffic\n"
"\t-v verbose (more v = more verbose)\n"
"\t-C vale-config specify a vale config\n"
Expand Down Expand Up @@ -2401,13 +2543,27 @@ start_threads(struct glob_arg *g) {
} else {
t->affinity = -1;
}
/* only thread 0 can send ARP */
t->garp = (g->garp && t->me == 0 && g->af == AF_INET);
/* default, init packets */
initialize_packet(t);
}
/* Wait for PHY reset. */
D("Wait %d secs for phy reset", g->wait_link);
sleep(g->wait_link);
D("Ready...");
if (g->wait_link || g->dev_type != DEV_NETMAP) {
D("Wait %d secs for phy reset", g->wait_link);
sleep(g->wait_link);
D("Ready...");
} else {
D("Wait for phy reset");
for (i = 5*4; i > 0; i--) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

20?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1000/250*5

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are waiting for 5 seconds here (max)

D("Wait up to 5 seconds for phy reset")

if (check_link_up(g->phyname)) {
D("Ready...");
break;
}
usleep(250000);
}
if (!i) D("Warning: timed out...");
}

for (i = 0; i < g->nthreads; i++) {
t = &targs[i];
Expand Down Expand Up @@ -2670,9 +2826,10 @@ main(int arc, char **argv)
g.nmr_config = "";
g.virt_header = 0;
g.wait_link = 2;
g.garp = 0;

while ((ch = getopt(arc, argv, "46a:f:F:Nn:i:Il:d:s:D:S:b:c:o:p:"
"T:w:WvR:XC:H:e:E:m:rP:zZAh")) != -1) {
"T:w:GWvR:XC:H:e:E:m:rP:zZAh")) != -1) {

switch(ch) {
default:
Expand Down Expand Up @@ -2756,6 +2913,7 @@ main(int arc, char **argv)
} else { /* prepend netmap: */
g.dev_type = DEV_NETMAP;
sprintf(g.ifname, "netmap:%s", optarg);
sprintf(g.phyname, "%s", optarg);
}
break;

Expand Down Expand Up @@ -2788,6 +2946,10 @@ main(int arc, char **argv)
g.wait_link = atoi(optarg);
break;

case 'G':
g.garp = 1;
break;

case 'W':
g.forever = 0; /* exit RX with no traffic */
break;
Expand Down