I have a client written in Python for a server, which functions through LAN. Some part of the algorithm uses socket reading intensively and it is executing about 3-6 times slower, than almost the same one written in C++. What solutions exist for making Python socket reading faster?
I have some simple buffering implemented, and my class for working with sockets looks like this:
import socket
import struct
class Sock():
def __init__(self):
self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.recv_buf = b''
self.send_buf = b''
def connect(self):
self.s.connect(('127.0.0.1', 6666))
def close(self):
self.s.close()
def recv(self, lngth):
while len(self.recv_buf) < lngth:
self.recv_buf += self.s.recv(lngth - len(self.recv_buf))
res = self.recv_buf[-lngth:]
self.recv_buf = self.recv_buf[:-lngth]
return res
def next_int(self):
return struct.unpack("i", self.recv(4))[0]
def next_float(self):
return struct.unpack("f", self.recv(4))[0]
def write_int(self, i):
self.send_buf += struct.pack('i', i)
def write_float(self, f):
self.send_buf += struct.pack('f', f)
def flush(self):
self.s.sendall(self.send_buf)
self.send_buf = b''
P.S.: profiling also shows that the majority of time is spent reading sockets.
Edit: Because data is received in blocks with known size, I can read the whole block at once. So I've changed my code to this:
class Sock():
def __init__(self):
self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.send_buf = b''
def connect(self):
self.s.connect(('127.0.0.1', 6666))
def close(self):
self.s.close()
def recv_prepare(self, cnt):
self.recv_buf = bytearray()
while len(self.recv_buf) < cnt:
self.recv_buf.extend(self.s.recv(cnt - len(self.recv_buf)))
self.recv_buf_i = 0
def skip_read(self, cnt):
self.recv_buf_i += cnt
def next_int(self):
self.recv_buf_i += 4
return struct.unpack("i", self.recv_buf[self.recv_buf_i - 4:self.recv_buf_i])[0]
def next_float(self):
self.recv_buf_i += 4
return struct.unpack("f", self.recv_buf[self.recv_buf_i - 4:self.recv_buf_i])[0]
def write_int(self, i):
self.send_buf += struct.pack('i', i)
def write_float(self, f):
self.send_buf += struct.pack('f', f)
def flush(self):
self.s.sendall(self.send_buf)
self.send_buf = b''
recv
'ing from socket looks optimal in this code. But now next_int
and next_float
became the second bottleneck, they take about 1 msec (3000 CPU cycles) per call just to unpack. Is it possible to make them faster, like in C++?
Your latest bottleneck is in next_int
and next_float
because you create intermediate strings from the bytearray
and because you only unpack one value at a time.
The struct
module has an unpack_from
that takes a buffer and an offset. This is more efficient because there is no need to create an intermediate string from your bytearray
:
def next_int(self):
self.recv_buf_i += 4
return struct.unpack_from("i", self.recv_buf, self.recv_buf_i-4)[0]
Additionally, struct
module can unpack more than one value at a time. Currently, you call from Python to C (via the module) for each value. You would be better served by calling it fewer times and letting it do more work on each call:
def next_chunk(self, fmt): # fmt can be a group such as "iifff"
sz = struct.calcsize(fmt)
self.recv_buf_i += sz
return struct.unpack_from(fmt, self.recv_buf, self.recv_buf_i-sz)
If you know that fmt
will always be 4 byte integers and floats you can replace struct.calcsize(fmt)
with 4 * len(fmt)
.
Finally, as a matter of preference I think this reads more cleanly:
def next_chunk(self, fmt):
sz = struct.calcsize(fmt)
chunk = struct.unpack_from(fmt, self.recv_buf, self.recv_buf_i)
self.recv_buf_i += sz
return chunk