Scrapy - simple captcha solving example
Solving the captcha itself is easy using Pillow and Python Tesseract. The hard part was realizing how to handle the session cookie (PHPSESSID). Here's a complete working example for your case (using Python 2):
# -*- coding: utf-8 -*-
import io
import urllib2
from PIL import Image
import pytesseract
import scrapy
class CaptchaSpider(scrapy.Spider):
    """Submit a captcha-protected login form by OCR-ing the captcha image.

    The login page and the captcha image are both requested with the same
    fixed ``PHPSESSID`` cookie (presumably the server ties the captcha to
    the PHP session -- confirm against the target site).
    """

    name = 'captcha'

    # Page that contains the login form and the captcha image.
    login_url = 'http://145.100.108.148/login3/'
    # Session id sent with both the page request and the image download.
    session_id = 'xyz'

    def start_requests(self):
        """Request the login page with the fixed session cookie."""
        yield scrapy.Request(self.login_url,
                             cookies={'PHPSESSID': self.session_id})

    def parse(self, response):
        """Download the captcha image, OCR it and submit the login form.

        Returns a ``scrapy.FormRequest`` that fills the ``captcha`` field
        with the recognised text, or ``None`` when the page has no
        ``<img>`` element.
        """
        img_src = response.xpath('//img/@src').extract_first()
        if img_src is None:
            # extract_first() yields None when nothing matches; there is
            # no image to OCR, so stop instead of failing further down.
            self.logger.error('No captcha image found on %s', response.url)
            return None
        img_url = response.urljoin(img_src)

        # The image is fetched outside Scrapy, so the session cookie has to
        # be attached by hand.
        url_opener = urllib2.build_opener()
        url_opener.addheaders.append(
            ('Cookie', 'PHPSESSID=%s' % self.session_id))
        img_response = url_opener.open(img_url)
        try:
            img_bytes = img_response.read()
        finally:
            img_response.close()

        img = Image.open(io.BytesIO(img_bytes))
        # Tesseract output can end with a newline / form feed; strip it so
        # only the captcha characters are submitted.
        captcha = pytesseract.image_to_string(img).strip()
        # A single-argument print(...) gives the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('Captcha solved: %s' % captcha)
        return scrapy.FormRequest.from_response(
            response, formdata={'captcha': captcha},
            callback=self.after_captcha)

    def after_captcha(self, response):
        """Print the body of the page returned after submitting the form."""
        print('Result: %s' % response.body)
Here is a solution that works well on straight images:
# Download a captcha image, binarise it with OpenCV and OCR the result with
# Tesseract (Python 2 script: relies on urllib.urlopen).
import urllib

import cv2
import pytesseract
from PIL import Image
# Not used in this snippet; kept in case later code relies on it.
from pdf2image import convert_from_path

best = ("https://my captcha url")

# Fetch the image, making sure the HTTP response is closed even on error.
response = urllib.urlopen(best)
try:
    captcha_bytes = response.read()
finally:
    response.close()

with open('captcha.jpg', 'wb') as f:
    f.write(captcha_bytes)

image = cv2.imread('captcha.jpg')
if image is None:
    # cv2.imread reports an unreadable file by returning None instead of
    # raising, which would otherwise surface as an obscure cvtColor error.
    raise IOError("could not decode 'captcha.jpg' as an image")

# Convert to grayscale, then to pure black/white using Otsu's automatic
# threshold.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# NOTE: the original author left cv2.medianBlur(gray, 3) disabled here; it
# may be worth re-enabling for noisy captchas.

# Save the preprocessed image and run OCR on that file.
filename = "{}.png".format("temp")
cv2.imwrite(filename, gray)
text = pytesseract.image_to_string(Image.open(filename))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(text)