diff --git a/hdl/common/network_stack.sv b/hdl/common/network_stack.sv
index d26b0f61d1e500b12100f1da2b0939c323a338b1..54f80b0a562301ff53bfc45ed97e1682751a8ce2 100755
--- a/hdl/common/network_stack.sv
+++ b/hdl/common/network_stack.sv
@@ -392,7 +392,8 @@ assign axis_ipv6_to_intercon.last = 1'b0;
 
 
 roce_stack #(
-    .ROCE_EN(ROCE_EN)
+    .ROCE_EN(ROCE_EN),
+    .WIDTH(WIDTH)
 ) rocev2_stack_inst(
     .net_clk(net_clk), // input aclk
     .net_aresetn(net_aresetn), // input aresetn
@@ -407,7 +408,7 @@ roce_stack #(
     
    //TX
     .s_axis_tx_meta(axis_tx_metadata),
-    .s_axis_tx_data(axis_tx_data),
+    .s_axis_tx_data(s_axis_roce_role_tx_data),
     
 `ifdef IP_VERSION4
     // IPv4
@@ -425,9 +426,9 @@ roce_stack #(
     
     .m_axis_mem_read_cmd(m_axis_roce_read_cmd),
     // Memory Write
-    .m_axis_mem_write_data(axis_roce_write_data),
+    .m_axis_mem_write_data(m_axis_roce_write_data),
     // Memory Read
-    .s_axis_mem_read_data(axis_roce_read_data),
+    .s_axis_mem_read_data(s_axis_roce_read_data),
     // Memory Write Status
     //.s_axis_mem_write_status_TVALID(s_axis_rxwrite_sts_TVALID),
     //.s_axis_mem_write_status_TREADY(s_axis_rxwrite_sts_TREADY),
@@ -1512,84 +1513,6 @@ axis_interconnect_merger_160 tx_metadata_merger (
 );
 
 
-/*
- * Width alignment
- */
-axi_stream #(.WIDTH(WIDTH) )   axis_roce_read_data();
-axi_stream #(.WIDTH(WIDTH) )   axis_roce_write_data();
-axi_stream #(.WIDTH(WIDTH) )  axis_tx_data();
-generate
-if (WIDTH==64) begin
-//TODO move
-//RoCE Data Path
-axis_512_to_64_converter roce_read_data_converter (
-  .aclk(net_clk),                    // input wire aclk
-  .aresetn(net_aresetn),              // input wire aresetn
-  .s_axis_tvalid(s_axis_roce_read_data.valid),  // input wire s_axis_tvalid
-  .s_axis_tready(s_axis_roce_read_data.ready),  // output wire s_axis_tready
-  .s_axis_tdata(s_axis_roce_read_data.data),    // input wire [63 : 0] s_axis_tdata
-  .s_axis_tkeep(s_axis_roce_read_data.keep),    // input wire [7 : 0] s_axis_tkeep
-  .s_axis_tlast(s_axis_roce_read_data.last),    // input wire s_axis_tlast
-  .m_axis_tvalid(axis_roce_read_data.valid),  // output wire m_axis_tvalid
-  .m_axis_tready(axis_roce_read_data.ready),  // input wire m_axis_tready
-  .m_axis_tdata(axis_roce_read_data.data),    // output wire [511 : 0] m_axis_tdata
-  .m_axis_tkeep(axis_roce_read_data.keep),    // output wire [63 : 0] m_axis_tkeep
-  .m_axis_tlast(axis_roce_read_data.last)    // output wire m_axis_tlast
-);
-
-axis_512_to_64_converter roce_tx_data_converter (
-  .aclk(net_clk),                    // input wire aclk
-  .aresetn(net_aresetn),              // input wire aresetn
-  .s_axis_tvalid(s_axis_roce_role_tx_data.valid),  // input wire s_axis_tvalid
-  .s_axis_tready(s_axis_roce_role_tx_data.ready),  // output wire s_axis_tready
-  .s_axis_tdata(s_axis_roce_role_tx_data.data),    // input wire [63 : 0] s_axis_tdata
-  .s_axis_tkeep(s_axis_roce_role_tx_data.keep),    // input wire [7 : 0] s_axis_tkeep
-  .s_axis_tlast(s_axis_roce_role_tx_data.last),    // input wire s_axis_tlast
-  .m_axis_tvalid(axis_tx_data.valid),  // output wire m_axis_tvalid
-  .m_axis_tready(axis_tx_data.ready),  // input wire m_axis_tready
-  .m_axis_tdata(axis_tx_data.data),    // output wire [511 : 0] m_axis_tdata
-  .m_axis_tkeep(axis_tx_data.keep),    // output wire [63 : 0] m_axis_tkeep
-  .m_axis_tlast(axis_tx_data.last)    // output wire m_axis_tlast
-);
-
-axis_64_to_512_converter roce_write_data_converter (
-  .aclk(net_clk),                    // input wire aclk
-  .aresetn(net_aresetn),              // input wire aresetn
-  .s_axis_tvalid(axis_roce_write_data.valid),  // input wire s_axis_tvalid
-  .s_axis_tready(axis_roce_write_data.ready),  // output wire s_axis_tready
-  .s_axis_tdata(axis_roce_write_data.data),    // input wire [63 : 0] s_axis_tdata
-  .s_axis_tkeep(axis_roce_write_data.keep),    // input wire [7 : 0] s_axis_tkeep
-  .s_axis_tlast(axis_roce_write_data.last),    // input wire s_axis_tlast
-  .s_axis_tdest(axis_roce_write_data.dest),    // input wire s_axis_tlast
-  .m_axis_tvalid(m_axis_roce_write_data.valid),  // output wire m_axis_tvalid
-  .m_axis_tready(m_axis_roce_write_data.ready),  // input wire m_axis_tready
-  .m_axis_tdata(m_axis_roce_write_data.data),    // output wire [511 : 0] m_axis_tdata
-  .m_axis_tkeep(m_axis_roce_write_data.keep),    // output wire [63 : 0] m_axis_tkeep
-  .m_axis_tlast(m_axis_roce_write_data.last),    // output wire m_axis_tlast
-  .m_axis_tdest(m_axis_roce_write_data.dest)    // output wire m_axis_tlast
-);
-end
-if (WIDTH==512) begin
-//RoCE Data Path
-assign axis_roce_read_data.valid = s_axis_roce_read_data.valid; 
-assign s_axis_roce_read_data.ready = axis_roce_read_data.ready;
-assign axis_roce_read_data.data = s_axis_roce_read_data.data;
-assign axis_roce_read_data.keep = s_axis_roce_read_data.keep;
-assign axis_roce_read_data.last = s_axis_roce_read_data.last;
-
-assign axis_tx_data.valid = s_axis_roce_role_tx_data.valid;
-assign s_axis_roce_role_tx_data.ready = axis_tx_data.ready;
-assign axis_tx_data.data = s_axis_roce_role_tx_data.data;
-assign axis_tx_data.keep = s_axis_roce_role_tx_data.keep;
-assign axis_tx_data.last = s_axis_roce_role_tx_data.last;
-
-assign m_axis_roce_write_data.valid = axis_roce_write_data.valid;
-assign axis_roce_write_data.ready = m_axis_roce_write_data.ready;
-assign m_axis_roce_write_data.data = axis_roce_write_data.data;
-assign m_axis_roce_write_data.keep = axis_roce_write_data.keep;
-assign m_axis_roce_write_data.last = axis_roce_write_data.last;
-end
-endgenerate
 /*
  * Statistics
  */
diff --git a/hdl/common/roce_stack.sv b/hdl/common/roce_stack.sv
index 9e80bbea4dc43eaccc44da3cc803541289ac7357..67a9fdb8b06dbb1dd14208f1d036043f6d6c9bdf 100755
--- a/hdl/common/roce_stack.sv
+++ b/hdl/common/roce_stack.sv
@@ -32,7 +32,8 @@
 //`define POINTER_CHASING
 
 module roce_stack #(
-    parameter ROCE_EN = 1
+    parameter ROCE_EN = 1,
+    parameter WIDTH = 64
 )(
     input wire          net_clk,
     input wire          net_aresetn,
@@ -103,11 +104,11 @@ rocev2_ip rocev2_inst(
     .s_axis_tx_meta_V_TVALID(s_axis_tx_meta.valid),
     .s_axis_tx_meta_V_TREADY(s_axis_tx_meta.ready),
     .s_axis_tx_meta_V_TDATA(s_axis_tx_meta.data),
-    .s_axis_tx_data_TVALID(s_axis_tx_data.valid),
-    .s_axis_tx_data_TREADY(s_axis_tx_data.ready),
-    .s_axis_tx_data_TDATA(s_axis_tx_data.data),
-    .s_axis_tx_data_TKEEP(s_axis_tx_data.keep),
-    .s_axis_tx_data_TLAST(s_axis_tx_data.last),
+    .s_axis_tx_data_TVALID(axis_tx_data.valid),
+    .s_axis_tx_data_TREADY(axis_tx_data.ready),
+    .s_axis_tx_data_TDATA(axis_tx_data.data),
+    .s_axis_tx_data_TKEEP(axis_tx_data.keep),
+    .s_axis_tx_data_TLAST(axis_tx_data.last),
     
     // IPv4
     .m_axis_tx_data_TVALID(m_axis_tx_data.valid),
@@ -127,18 +128,18 @@ rocev2_ip rocev2_inst(
     .m_axis_mem_read_cmd_TDATA(m_axis_mem_read_cmd.data),
     .m_axis_mem_read_cmd_TDEST(m_axis_mem_read_cmd.dest),
     // Memory Write
-    .m_axis_mem_write_data_TVALID(m_axis_mem_write_data.valid),
-    .m_axis_mem_write_data_TREADY(m_axis_mem_write_data.ready),
-    .m_axis_mem_write_data_TDATA(m_axis_mem_write_data.data),
-    .m_axis_mem_write_data_TKEEP(m_axis_mem_write_data.keep),
-    .m_axis_mem_write_data_TLAST(m_axis_mem_write_data.last),
-    .m_axis_mem_write_data_TDEST(m_axis_mem_write_data.dest),
+    .m_axis_mem_write_data_TVALID(axis_mem_write_data.valid),
+    .m_axis_mem_write_data_TREADY(axis_mem_write_data.ready),
+    .m_axis_mem_write_data_TDATA(axis_mem_write_data.data),
+    .m_axis_mem_write_data_TKEEP(axis_mem_write_data.keep),
+    .m_axis_mem_write_data_TLAST(axis_mem_write_data.last),
+    .m_axis_mem_write_data_TDEST(axis_mem_write_data.dest),
     // Memory Read
-    .s_axis_mem_read_data_TVALID(s_axis_mem_read_data.valid),
-    .s_axis_mem_read_data_TREADY(s_axis_mem_read_data.ready),
-    .s_axis_mem_read_data_TDATA(s_axis_mem_read_data.data),
-    .s_axis_mem_read_data_TKEEP(s_axis_mem_read_data.keep),
-    .s_axis_mem_read_data_TLAST(s_axis_mem_read_data.last),
+    .s_axis_mem_read_data_TVALID(axis_mem_read_data.valid),
+    .s_axis_mem_read_data_TREADY(axis_mem_read_data.ready),
+    .s_axis_mem_read_data_TDATA(axis_mem_read_data.data),
+    .s_axis_mem_read_data_TKEEP(axis_mem_read_data.keep),
+    .s_axis_mem_read_data_TLAST(axis_mem_read_data.last),
     // Memory Write Status
     //.s_axis_mem_write_status_TVALID(s_axis_rxwrite_sts_TVALID),
     //.s_axis_mem_write_status_TREADY(s_axis_rxwrite_sts_TREADY),
@@ -169,6 +170,84 @@ rocev2_ip rocev2_inst(
     .regInvalidPsnDropCount_V_ap_vld(psn_drop_pkg_count_valid)
 );
 
+/*
+ * Width alignment: WIDTH-wide internal streams between the module ports and rocev2_inst
+ */
+axi_stream #(.WIDTH(WIDTH) )   axis_mem_read_data();  // feeds rocev2_inst s_axis_mem_read_data_*
+axi_stream #(.WIDTH(WIDTH) )   axis_mem_write_data();  // driven by rocev2_inst m_axis_mem_write_data_*
+axi_stream #(.WIDTH(WIDTH) )   axis_tx_data();  // feeds rocev2_inst s_axis_tx_data_*
+//generate
+if (WIDTH==64) begin
+//RoCE data path (WIDTH==64): converters between the module's stream ports and the WIDTH-wide internal streams
+axis_512_to_64_converter roce_read_data_converter ( // NOTE(review): widths in the comments below follow the converter name - confirm against the IP
+  .aclk(net_clk),                    // input wire aclk
+  .aresetn(net_aresetn),              // input wire aresetn
+  .s_axis_tvalid(s_axis_mem_read_data.valid),  // input wire s_axis_tvalid
+  .s_axis_tready(s_axis_mem_read_data.ready),  // output wire s_axis_tready
+  .s_axis_tdata(s_axis_mem_read_data.data),    // input wire [511 : 0] s_axis_tdata
+  .s_axis_tkeep(s_axis_mem_read_data.keep),    // input wire [63 : 0] s_axis_tkeep
+  .s_axis_tlast(s_axis_mem_read_data.last),    // input wire s_axis_tlast
+  .m_axis_tvalid(axis_mem_read_data.valid),  // output wire m_axis_tvalid
+  .m_axis_tready(axis_mem_read_data.ready),  // input wire m_axis_tready
+  .m_axis_tdata(axis_mem_read_data.data),    // output wire [63 : 0] m_axis_tdata
+  .m_axis_tkeep(axis_mem_read_data.keep),    // output wire [7 : 0] m_axis_tkeep
+  .m_axis_tlast(axis_mem_read_data.last)    // output wire m_axis_tlast
+);
+
+axis_512_to_64_converter roce_tx_data_converter ( // narrows the s_axis_tx_data port onto the WIDTH-wide axis_tx_data stream consumed by rocev2_inst
+  .aclk(net_clk),                    // input wire aclk
+  .aresetn(net_aresetn),              // input wire aresetn
+  .s_axis_tvalid(s_axis_tx_data.valid),  // input wire s_axis_tvalid (this module's port; s_axis_roce_role_tx_data only exists in network_stack)
+  .s_axis_tready(s_axis_tx_data.ready),  // output wire s_axis_tready
+  .s_axis_tdata(s_axis_tx_data.data),    // input wire [511 : 0] s_axis_tdata (width per converter name)
+  .s_axis_tkeep(s_axis_tx_data.keep),    // input wire [63 : 0] s_axis_tkeep (width per converter name)
+  .s_axis_tlast(s_axis_tx_data.last),    // input wire s_axis_tlast
+  .m_axis_tvalid(axis_tx_data.valid),  // output wire m_axis_tvalid
+  .m_axis_tready(axis_tx_data.ready),  // input wire m_axis_tready
+  .m_axis_tdata(axis_tx_data.data),    // output wire [63 : 0] m_axis_tdata
+  .m_axis_tkeep(axis_tx_data.keep),    // output wire [7 : 0] m_axis_tkeep
+  .m_axis_tlast(axis_tx_data.last)    // output wire m_axis_tlast
+);
+
+axis_64_to_512_converter roce_write_data_converter ( // widens rocev2_inst's memory-write stream (axis_mem_write_data) onto the m_axis_mem_write_data port
+  .aclk(net_clk),                    // input wire aclk
+  .aresetn(net_aresetn),              // input wire aresetn
+  .s_axis_tvalid(axis_mem_write_data.valid),  // input wire s_axis_tvalid
+  .s_axis_tready(axis_mem_write_data.ready),  // output wire s_axis_tready
+  .s_axis_tdata(axis_mem_write_data.data),    // input wire [63 : 0] s_axis_tdata
+  .s_axis_tkeep(axis_mem_write_data.keep),    // input wire [7 : 0] s_axis_tkeep
+  .s_axis_tlast(axis_mem_write_data.last),    // input wire s_axis_tlast
+  .s_axis_tdest(axis_mem_write_data.dest),    // input wire s_axis_tdest
+  .m_axis_tvalid(m_axis_mem_write_data.valid),  // output wire m_axis_tvalid
+  .m_axis_tready(m_axis_mem_write_data.ready),  // input wire m_axis_tready
+  .m_axis_tdata(m_axis_mem_write_data.data),    // output wire [511 : 0] m_axis_tdata
+  .m_axis_tkeep(m_axis_mem_write_data.keep),    // output wire [63 : 0] m_axis_tkeep
+  .m_axis_tlast(m_axis_mem_write_data.last),    // output wire m_axis_tlast
+  .m_axis_tdest(m_axis_mem_write_data.dest)    // output wire m_axis_tdest
+);
+end
+if (WIDTH==512) begin
+//RoCE data path (WIDTH==512): no converters, ports are wired straight to the internal streams
+assign axis_mem_read_data.valid = s_axis_mem_read_data.valid;
+assign s_axis_mem_read_data.ready = axis_mem_read_data.ready; // ready travels opposite to valid/data
+assign axis_mem_read_data.data = s_axis_mem_read_data.data;
+assign axis_mem_read_data.keep = s_axis_mem_read_data.keep;
+assign axis_mem_read_data.last = s_axis_mem_read_data.last;
+
+assign axis_tx_data.valid = s_axis_tx_data.valid;
+assign s_axis_tx_data.ready = axis_tx_data.ready; // ready travels opposite to valid/data
+assign axis_tx_data.data = s_axis_tx_data.data;
+assign axis_tx_data.keep = s_axis_tx_data.keep;
+assign axis_tx_data.last = s_axis_tx_data.last;
+
+assign m_axis_mem_write_data.valid = axis_mem_write_data.valid;
+assign axis_mem_write_data.ready = m_axis_mem_write_data.ready; // ready travels opposite to valid/data
+assign m_axis_mem_write_data.data = axis_mem_write_data.data;
+assign m_axis_mem_write_data.keep = axis_mem_write_data.keep;
+assign m_axis_mem_write_data.last = axis_mem_write_data.last; // NOTE(review): .dest is not forwarded here although the WIDTH==64 branch passes it through its converter - confirm whether m_axis_mem_write_data.dest must be driven
+end
+//endgenerate
+
 assign m_axis_rx_pcmeta.valid = 1'b0;
 assign m_axis_rx_pcmeta.data = 0;
 assign s_axis_tx_pcmeta.ready = 1'b1;
diff --git a/hls/iperf_client/iperf_client.cpp b/hls/iperf_client/iperf_client.cpp
index 485b347c7368a81d5e5b708ae60d1798b92ae263..eaafa6cddfbb0db290dc551bb73a39f1c1470742 100644
--- a/hls/iperf_client/iperf_client.cpp
+++ b/hls/iperf_client/iperf_client.cpp
@@ -1,5 +1,5 @@
 /************************************************
-Copyright (c) 2018, Systems Group, ETH Zurich.
+Copyright (c) 2019, Systems Group, ETH Zurich.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification,
@@ -30,15 +30,29 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "iperf_client.hpp"
 #include <iostream>
 
+//Buffers responses coming from the TCP stack: moves at most one appTxRsp per call into txStatusBuffer, keeping only sessionID and error
+void status_handler(hls::stream<appTxRsp>&				txStatus,
+							hls::stream<internalAppTxRsp>&	txStatusBuffer)
+{
+#pragma HLS PIPELINE II=1
+#pragma HLS INLINE off
+
+	if (!txStatus.empty()) // only read when a response is pending, so the call never blocks on input
+	{
+		appTxRsp resp = txStatus.read();
+		txStatusBuffer.write(internalAppTxRsp(resp.sessionID, resp.error));
+	}
+}
+
 template <int WIDTH>
-void client(	stream<ipTuple>&		openConnection,
-            stream<openStatus>& openConStatus,
-				stream<ap_uint<16> >&	closeConnection,
-				stream<appTxMeta>&	txMetaData,
-				stream<net_axis<WIDTH> >& txData,
-				stream<appTxRsp>&	txStatus,
-				stream<bool>&			startSignal,
-				stream<bool>&			stopSignal,
+void client(hls::stream<ipTuple>&				openConnection,
+            hls::stream<openStatus>& 			openConStatus,
+				hls::stream<ap_uint<16> >&			closeConnection,
+				hls::stream<appTxMeta>&				txMetaData,
+				hls::stream<net_axis<WIDTH> >& 	txData,
+				hls::stream<internalAppTxRsp>&	txStatus,
+				hls::stream<bool>&					startSignal,
+				hls::stream<bool>&					stopSignal,
 				ap_uint<1>		runExperiment,
 				ap_uint<1>		dualModeEn,
 				ap_uint<14>		useConn,
@@ -178,7 +192,7 @@ void client(	stream<ipTuple>&		openConnection,
 		}
 		else if (!txStatus.empty())
 		{
-			appTxRsp resp = txStatus.read();
+			internalAppTxRsp resp = txStatus.read();
 			if (resp.error == 0)
 			{
 				currentSessionID = resp.sessionID;
@@ -226,7 +240,7 @@ void client(	stream<ipTuple>&		openConnection,
 	case CHECK_REQ:
 		if (!txStatus.empty())
 		{
-			appTxRsp resp = txStatus.read();
+			internalAppTxRsp resp = txStatus.read();
 			if (resp.error == 0)
 			{
 				currentSessionID = resp.sessionID;
@@ -428,18 +442,18 @@ void clock( hls::stream<bool>&	startSignal,
 }
 
 
-void iperf_client(	stream<ap_uint<16> >& listenPort,
-					stream<bool>& listenPortStatus,
-					stream<appNotification>& notifications,
-					stream<appReadRequest>& readRequest,
-					stream<ap_uint<16> >& rxMetaData,
-					stream<net_axis<DATA_WIDTH> >& rxData,
-					stream<ipTuple>& openConnection,
-					stream<openStatus>& openConStatus,
-					stream<ap_uint<16> >& closeConnection,
-					stream<appTxMeta>& txMetaData,
-					stream<net_axis<DATA_WIDTH> >& txData,
-					stream<appTxRsp>& txStatus,
+void iperf_client(	hls::stream<ap_uint<16> >& listenPort,
+					hls::stream<bool>& listenPortStatus,
+					hls::stream<appNotification>& notifications,
+					hls::stream<appReadRequest>& readRequest,
+					hls::stream<ap_uint<16> >& rxMetaData,
+					hls::stream<net_axis<DATA_WIDTH> >& rxData,
+					hls::stream<ipTuple>& openConnection,
+					hls::stream<openStatus>& openConStatus,
+					hls::stream<ap_uint<16> >& closeConnection,
+					hls::stream<appTxMeta>& txMetaData,
+					hls::stream<net_axis<DATA_WIDTH> >& txData,
+					hls::stream<appTxRsp>& txStatus,
 					ap_uint<1>		runExperiment,
 					ap_uint<1>		dualModeEn,
 					ap_uint<14>		useConn,
@@ -509,15 +523,21 @@ void iperf_client(	stream<ap_uint<16> >& listenPort,
 	#pragma HLS STREAM variable=startSignalFifo depth=2
 	#pragma HLS STREAM variable=stopSignalFifo depth=2
 
+	//This is required to buffer up to 1024 responses => supporting up to 1024 connections
+	static hls::stream<internalAppTxRsp>	txStatusBuffer("txStatusBuffer");
+	#pragma HLS STREAM variable=txStatusBuffer depth=1024
+
 	/*
 	 * Client
 	 */
+	status_handler(txStatus, txStatusBuffer);
+
 	client<DATA_WIDTH>(	openConnection,
 			openConStatus,
 			closeConnection,
 			txMetaData,
 			txData,
-			txStatus,
+			txStatusBuffer,
 			startSignalFifo,
 			stopSignalFifo,
 			runExperiment,
diff --git a/hls/iperf_client/iperf_client.hpp b/hls/iperf_client/iperf_client.hpp
index 1a298da5e1f9eba92ed36b1ed1fc307c2075d84e..1f9a2e0b808234f9ecf010229e3ebe37d529a5ae 100644
--- a/hls/iperf_client/iperf_client.hpp
+++ b/hls/iperf_client/iperf_client.hpp
@@ -1,5 +1,5 @@
 /************************************************
-Copyright (c) 2018, Systems Group, ETH Zurich.
+Copyright (c) 2019, Systems Group, ETH Zurich.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification,
@@ -82,6 +82,15 @@ public:
    }
 };
 
+struct internalAppTxRsp // sessionID/error pair; NOTE(review): appears to be a trimmed copy of appTxRsp for internal buffering - confirm field widths match appTxRsp
+{
+	ap_uint<16>	sessionID; // session the response refers to
+	ap_uint<2>	error; // callers in iperf_client.cpp treat 0 as success
+	internalAppTxRsp() {} // default constructor; does not explicitly initialise the fields
+	internalAppTxRsp(ap_uint<16> id, ap_uint<2> err)
+		:sessionID(id), error(err) {}
+};
+
 void iperf_client(	hls::stream<ap_uint<16> >& listenPort,
 					hls::stream<bool>& listenPortStatus,
 					hls::stream<appNotification>& notifications,