Scraping of protected email
Judging by the CF tag in the HTML you supplied, I assume you are scraping a site served through Cloudflare. Cloudflare offers a feature that obfuscates listed e-mail addresses (see here): the addresses are stored encoded in the HTML and decoded in the browser by JavaScript. That is why you see the e-mail addresses when using Selenium (which executes the JavaScript) but not when using requests (which only fetches the raw HTML).
Since the decryption method can be easily taken from the JavaScript, you can write your own decryption method in Python.
In JavaScript,
// Immediately-invoked decoder: finds the element with id "__cf_email__",
// decodes the hex string stored in its class name and replaces the element
// with a text node holding the decoded address.
//
// Encoding: the first two hex digits are a one-byte XOR key; every following
// pair of hex digits is one character code XOR-ed with that key.
(function () {
    try {
        var decoded, encoded, pos, key, code,
            placeholder = document.getElementById("__cf_email__");
        // If the element is missing this throws; the catch below swallows it.
        encoded = placeholder.className;
        if (encoded) {
            decoded = '';
            key = parseInt(encoded.slice(0, 2), 16);
            // Stop once fewer than two digits remain. The previous test
            // (`length - pos` being non-zero) never became false for an
            // odd-length string and looped forever.
            for (pos = 2; pos + 1 < encoded.length; pos += 2) {
                code = parseInt(encoded.slice(pos, pos + 2), 16) ^ key;
                decoded += String.fromCharCode(code);
            }
            placeholder.parentNode.replaceChild(
                document.createTextNode(decoded), placeholder);
        }
    } catch (e) {
        // Deliberately best-effort: on any failure leave the page untouched.
    }
})();
In Python,
def decodeEmail(e):
    """Decode a hex-encoded, XOR-obfuscated e-mail string.

    The first two hex digits of ``e`` are the XOR key; each following
    complete pair of hex digits is one character code XOR-ed with that key.
    A dangling final digit (odd-length input) is ignored.

    Raises ValueError if the key or any pair is not valid hexadecimal.
    """
    key = int(e[:2], 16)
    # Stopping at len(e) - 1 guarantees every slice holds two digits.
    offsets = range(2, len(e) - 1, 2)
    return "".join(chr(int(e[pos:pos + 2], 16) ^ key) for pos in offsets)
The same decoder in several languages:
Javascript
/**
 * Decodes a Cloudflare-obfuscated e-mail string.
 *
 * The first two hex digits are a one-byte XOR key; every following pair of
 * hex digits is one character code XOR-ed with that key.
 *
 * @param {string} encodedString Hex string (key byte followed by data bytes).
 * @returns {string} The decoded text. A dangling final digit is ignored.
 */
function cfDecodeEmail(encodedString) {
    var email = "";
    var key = parseInt(encodedString.slice(0, 2), 16);
    // Stop once fewer than two digits remain. The previous test
    // (`length - n` being non-zero) never became false for odd-length
    // input and looped forever.
    for (var pos = 2; pos + 1 < encodedString.length; pos += 2) {
        var code = parseInt(encodedString.slice(pos, pos + 2), 16) ^ key;
        email += String.fromCharCode(code);
    }
    return email;
}
console.log(cfDecodeEmail("543931142127353935313e352e7a373b39")); // usage
Python
def cfDecodeEmail(encodedString):
    """Decode a Cloudflare-obfuscated e-mail string.

    The first two hex digits of ``encodedString`` are the XOR key; each
    following chunk of up to two hex digits is one character code XOR-ed
    with that key.

    Raises ValueError if the key or any chunk is not valid hexadecimal.
    """
    r = int(encodedString[:2], 16)
    return ''.join(
        chr(int(encodedString[i:i + 2], 16) ^ r)
        for i in range(2, len(encodedString), 2)
    )


# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare print statement used before is a SyntaxError on Python 3.
print(cfDecodeEmail('543931142127353935313e352e7a373b39'))  # usage
PHP
/**
 * Decodes a hex-encoded, XOR-obfuscated e-mail string.
 *
 * The first two hex digits are the XOR key; each following complete pair of
 * hex digits is one character code XOR-ed with that key.
 *
 * @param string $encodedString Hex string (key byte followed by data bytes).
 * @return string The decoded text.
 */
function cfDecodeEmail($encodedString){
    $key = hexdec(substr($encodedString, 0, 2));
    $decoded = '';
    $pos = 2;
    // Stopping one short of the length means every read holds two digits.
    while ($pos < strlen($encodedString) - 1) {
        $byte = hexdec(substr($encodedString, $pos, 2));
        $decoded .= chr($byte ^ $key);
        $pos += 2;
    }
    return $decoded;
}
echo cfDecodeEmail('543931142127353935313e352e7a373b39'); // usage
GO
package main
import (
"bytes"
"strconv"
)
// cf decodes a Cloudflare-obfuscated e-mail string.
//
// The first two hex digits of a are a one-byte XOR key; every following
// complete pair of hex digits is one character code XOR-ed with that key.
// Each decoded code is written as a rune, as the previous string(int64)
// conversion did.
//
// Input that is too short to hold a key, or that contains a non-hex pair,
// yields the empty string. A dangling final digit is ignored. Previously such
// input either panicked on an out-of-range slice or silently decoded garbage.
func cf(a string) (s string) {
	if len(a) < 2 {
		return ""
	}
	// bitSize 8 makes ParseUint reject anything that does not fit one byte.
	r, err := strconv.ParseUint(a[0:2], 16, 8)
	if err != nil {
		return ""
	}
	var e bytes.Buffer
	for n := 2; n+2 <= len(a); n += 2 {
		i, err := strconv.ParseUint(a[n:n+2], 16, 8)
		if err != nil {
			return ""
		}
		// WriteRune avoids the int-to-string conversion that go vet flags.
		e.WriteRune(rune(i ^ r))
	}
	return e.String()
}
// main decodes the sample string and writes it, followed by a newline,
// using the builtin println.
func main() {
	println(cf("543931142127353935313e352e7a373b39")) // usage
}
C++
#include <iostream>
#include <string>
using namespace std;
// Forward declaration so main() can call the decoder before its definition.
std::string cfDecodeEmail(std::string encodedString);

// Demo entry point: decodes the sample string and prints it on its own line.
int main()
{
    const std::string decoded = cfDecodeEmail("543931142127353935313e352e7a373b39");
    std::cout << decoded << std::endl;
    return 0;
}
/// @brief Decodes a Cloudflare-obfuscated e-mail string.
///
/// The first two hex digits are a one-byte XOR key; every following complete
/// pair of hex digits is one character XOR-ed with that key.
///
/// @param encodedString Hex string (key byte followed by data bytes).
/// @return The decoded text. Input shorter than two characters yields an
///         empty string, and a dangling final digit is ignored.
/// @throws std::invalid_argument if a pair does not start with a hex digit
///         (propagated from std::stoi).
std::string cfDecodeEmail(std::string encodedString)
{
    std::string email;

    // Without a complete key byte there is nothing to decode; std::stoi would
    // otherwise throw on the empty substring.
    if (encodedString.length() < 2)
        return email;

    // Work in unsigned so the XOR never involves a sign-extended char.
    const unsigned xorKey =
        static_cast<unsigned>(std::stoi(encodedString.substr(0, 2), nullptr, 16));

    email.reserve(encodedString.length() / 2 - 1);
    for (std::string::size_type i = 2; i + 1 < encodedString.length(); i += 2)
    {
        const unsigned value =
            static_cast<unsigned>(std::stoi(encodedString.substr(i, 2), nullptr, 16));
        // Mask to one byte and narrow explicitly instead of relying on the
        // implicit int -> char conversion.
        email += static_cast<char>((value ^ xorKey) & 0xFFu);
    }
    return email;
}
C#
using System;
/// <summary>
/// Demo program that decodes a Cloudflare-obfuscated e-mail string.
/// </summary>
public class Program
{
    /// <summary>
    /// Decodes a hex string whose first two digits are a one-byte XOR key and
    /// whose remaining complete digit pairs are character codes XOR-ed with
    /// that key.
    /// </summary>
    /// <param name="encodedString">Hex string (key byte followed by data bytes).</param>
    /// <returns>
    /// The decoded text. Null input, or input shorter than two characters,
    /// yields an empty string; a dangling final digit is ignored.
    /// </returns>
    /// <exception cref="FormatException">A digit pair is not valid hexadecimal.</exception>
    public static string cfDecodeEmail(string encodedString)
    {
        // Without a complete key byte there is nothing to decode; Substring
        // would otherwise throw.
        if (encodedString == null || encodedString.Length < 2)
        {
            return "";
        }

        int key = Convert.ToInt32(encodedString.Substring(0, 2), 16);

        // Integer division drops a dangling final digit, which previously
        // made Substring throw ArgumentOutOfRangeException.
        char[] decoded = new char[(encodedString.Length - 2) / 2];
        for (int index = 0; index < decoded.Length; index++)
        {
            string pair = encodedString.Substring(2 + 2 * index, 2);
            decoded[index] = (char)(Convert.ToInt32(pair, 16) ^ key);
        }
        return new string(decoded);
    }

    /// <summary>Prints the decoded sample address.</summary>
    public static void Main(string[] args)
    {
        Console.WriteLine(cfDecodeEmail("543931142127353935313e352e7a373b39")); // usage
    }
}
Following the algorithm above, I wrote a Ruby version to decode the [protected email] value when parsing with Nokogiri:
# Decodes a hex-encoded, XOR-obfuscated e-mail string.
#
# The first two hex digits of +e+ are the XOR key; each following complete
# pair of hex digits is one character code XOR-ed with that key.
# Raises ArgumentError if the key or a pair is not valid hexadecimal.
def decode_email(e)
  key = Integer(e[0, 2], 16)
  # Offsets of each digit pair after the key; the last one is length - 2.
  offsets = 2.step(e.length - 2, 2)
  offsets.map { |offset| (Integer(e[offset, 2], 16) ^ key).chr }.join('')
end