Creating a raw HTTP request with sockets
"""
This module is a demonstration of how to send
an HTTP request from scratch with the socket module.
"""
import socket
__author__ = "Ricky L Wilson."
__email__ = "[email protected]"
"""
The term CRLF refers to a Carriage Return (ASCII 13, \r)
followed by a Line Feed (ASCII 10, \n).
Together they are used to mark the end of a line;
line endings are, however, dealt with
differently in today's popular operating systems.
"""
# Line terminator: request_header() joins the request lines with it and
# send_request() splits the response headers from the body on CRLF + CRLF.
CRLF = '\r\n'
# Single space; not referenced by the code visible in this snippet.
SP = ' '
# Bare carriage return; parse_header() splits the header block on it.
CR = '\r'
# Default target used by request_header() and send_request().
HOST = 'www.example.com'
PORT = 80
PATH = '/'
def request_header(host=HOST, path=PATH):
    """
    Build the text of a minimal HTTP/1.1 GET request.

    The result consists of the request line for *path*, a Host header
    for *host* and a "Connection: Close" header, each separated by CRLF
    and finished with the blank line that ends the header section.
    """
    request_line = "GET {} HTTP/1.1".format(path)
    host_line = "Host: {}".format(host)
    # The last piece already carries the terminating blank line.
    closing_line = "Connection: Close\r\n\r\n"
    return CRLF.join((request_line, host_line, closing_line))
def parse_header(header):
    """
    Parse the header section of an HTTP response.

    *header* is the raw text that precedes the blank line separating
    the headers from the body: the Status-Line followed by zero or more
    "Name: value" lines.

    Returns a ``(fields, code)`` tuple. ``fields`` maps each lower-cased
    field name to its value with surrounding whitespace removed, and
    ``code`` is the numeric status code as a string (e.g. ``"200"``).

    Raises IndexError if the Status-Line does not contain a status code.
    """
    # Split on the line feed and strip every line, so both CRLF and bare
    # LF line endings are accepted and no stray '\r' or '\n' ends up
    # inside a field name or value.
    lines = [line.strip() for line in header.split('\n')]
    # The first line of a response is the Status-Line: the protocol
    # version, a numeric status code and its textual phrase, separated
    # by spaces. The status code is the second element.
    code = lines.pop(0).split()[1]
    fields = {}
    for line in lines:
        # Only the first ':' separates the name from the value; values
        # such as dates contain further colons.
        name, separator, value = line.partition(':')
        if not separator:
            # Blank or malformed line: skip it instead of failing.
            continue
        fields[name.strip().lower()] = value.strip()
    return fields, code
def send_request(host=HOST, path=PATH, port=PORT):
    """
    Send an HTTP GET request and return the parsed response.

    Connects to *host* on *port*, sends the request produced by
    request_header() for *path*, and reads until the server closes the
    connection.

    Returns a ``(header, code, body)`` tuple: the header fields and
    status code as returned by parse_header(), and the response body as
    received.
    """
    # A network socket is an internal endpoint for sending or receiving
    # data within a node on a computer network. Concretely, it is a
    # representation of this endpoint in networking software (the
    # protocol stack), such as an entry in a table listing communication
    # protocol, destination, status, etc., and is a form of system
    # resource. The name is an analogy with physical connectors: two
    # nodes communicate through a channel as if a cable were plugged
    # into a socket at each end. The analogy is limited, as network
    # communication need not be one-to-one or have a dedicated channel.
    sock = socket.socket()
    try:
        # Connect to the server.
        sock.connect((host, port))
        # sendall() keeps writing until the whole request has gone out;
        # a plain send() may transmit only part of it.
        sock.sendall(request_header(host, path))
        # The request asks for "Connection: Close", so the end of the
        # response is signalled by recv() returning an empty string.
        chunks = []
        chunk = sock.recv(4096)
        while chunk:
            chunks.append(chunk)
            chunk = sock.recv(4096)
    finally:
        # Release the socket even if connecting, sending or receiving
        # raised.
        sock.close()
    response = ''.join(chunks)
    # HTTP headers are separated from the body by an empty line.
    header, _, body = response.partition(CRLF + CRLF)
    header, code = parse_header(header)
    return header, code, body
# Demo: request the root page of www.google.com and print the status code
# and the body. This runs when the module is executed or imported, and it
# uses the Python 2 print statement.
header, code, body = send_request(host='www.google.com')
print code, CRLF, body
import socket
import urlparse
# Default socket timeout; also the default for get()'s 'timeout' option.
CONNECTION_TIMEOUT = 5
# Default number of bytes requested per recv() call in receive_all().
CHUNK_SIZE = 1024
# Default protocol version formatted into the request line by get().
HTTP_VERSION = 1.0
# NOTE: despite its name this is a double CRLF, i.e. the blank line that
# ends a header section. get() uses it both to terminate the request and
# to split the response headers from the body.
CRLF = "\r\n\r\n"
# Module-level side effect: sets the default timeout for socket objects
# created after this point.
socket.setdefaulttimeout(CONNECTION_TIMEOUT)
def receive_all(sock, chunk_size=CHUNK_SIZE):
    """
    Read from *sock* until recv() returns an empty result.

    Data is requested *chunk_size* bytes at a time (the value is passed
    through int()), and everything received is returned joined into a
    single string.
    """
    pieces = []
    piece = sock.recv(int(chunk_size))
    while piece:
        pieces.append(piece)
        piece = sock.recv(int(chunk_size))
    return ''.join(pieces)
def get(url, **kw):
    """
    Fetch *url* with a plain HTTP GET over a raw socket.

    Keyword options (all optional):
      timeout            -- socket timeout (default CONNECTION_TIMEOUT).
      chunk_size         -- bytes requested per recv() call
                            (default CHUNK_SIZE).
      http_version       -- version placed in the request line
                            (default HTTP_VERSION).
      body_only          -- return only the response body.
      headers_only       -- return only the header lines that follow
                            the status line.
      response_code_only -- return only the status code, as a string.

    If several of the ``*_only`` flags are set, ``body_only`` wins over
    ``headers_only``, which wins over ``response_code_only``. With none
    set, the whole decoded response is returned.

    Raises IndexError if the response has no status code in its first
    line (for example, when the server sends nothing).
    """
    kw.setdefault('timeout', CONNECTION_TIMEOUT)
    kw.setdefault('chunk_size', CHUNK_SIZE)
    kw.setdefault('http_version', HTTP_VERSION)
    kw.setdefault('headers_only', False)
    kw.setdefault('response_code_only', False)
    kw.setdefault('body_only', False)

    url = urlparse.urlparse(url)
    # netloc may include "user:pass@" or ":port"; hostname is the bare
    # host name, which is what connect() needs.
    host = url.hostname or url.netloc
    port = url.port or 80

    # Request target: the path plus the query string, if there is one.
    target = url.path or '/'
    if url.query:
        target = '{0}?{1}'.format(target, url.query)
    # Name the port in the Host header only when the URL gave one.
    if url.port is None:
        host_header = host
    else:
        host_header = '{0}:{1}'.format(host, url.port)
    # The module-level CRLF is the blank line ("\r\n\r\n") that ends the
    # header section.
    request = 'GET {0} HTTP/{1}\r\nHost: {2}'.format(
        target, kw['http_version'], host_header) + CRLF

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.settimeout(kw['timeout'])
        sock.connect((host, port))
        sock.sendall(request)
        data = receive_all(sock, chunk_size=kw['chunk_size'])
        try:
            sock.shutdown(socket.SHUT_RDWR)
        except socket.error:
            # The peer has already closed the connection, so there is
            # nothing left to shut down.
            pass
    finally:
        # Always release the socket, even when the request failed.
        sock.close()

    data = data.decode(errors='ignore')
    # Split once at the first blank line so that text in the body that
    # happens to match a header line is left untouched.
    head, _, body = data.partition(CRLF)
    status_line, _, headers = head.partition('\r\n')
    response_code = status_line.split()[1]

    if kw['body_only']:
        return body
    if kw['headers_only']:
        return headers
    if kw['response_code_only']:
        return response_code
    return data
# Demo: fetch the Google home page and print the whole decoded response.
# This runs when the module is executed or imported.
print(get('http://www.google.com/'))
Yes, basically you just have to write text, something like:
GET /pageyouwant.html HTTP/1.1[CRLF]
Host: google.com[CRLF]
Connection: close[CRLF]
User-Agent: MyAwesomeUserAgent/1.0.0[CRLF]
Accept-Encoding: gzip[CRLF]
Accept-Charset: ISO-8859-1,UTF-8;q=0.7,*;q=0.7[CRLF]
Cache-Control: no-cache[CRLF]
[CRLF]
Feel free to remove / add headers at will.
Most of what you need to know is in the HTTP/1.1 spec, which you should definitely study if you want to roll your own HTTP implementation: http://www.w3.org/Protocols/rfc2616/rfc2616.html