RegexTokenizer code example

Example: a regex-based tokenizer that extracts tokens

# A regex-based tokenizer that extracts tokens.
from pyspark.ml.feature import RegexTokenizer
from pyspark.sql import SparkSession

# Reuse an existing SparkSession or create one.
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("A B  c",)], ["text"])
reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
reTokenizer.transform(df).head()
# Row(text='A B  c', words=['a', 'b', 'c'])
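
# A sketch beyond the original example: with gaps=False the pattern is matched
# as the tokens themselves rather than used as a split delimiter, and
# minTokenLength drops short matches. dfDemo is a hypothetical DataFrame added
# here for illustration.
matchTokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                                gaps=False, pattern="[a-z]+", minTokenLength=3)
dfDemo = spark.createDataFrame([("Spark is fun",)], ["text"])
matchTokenizer.transform(dfDemo).head()
# Row(text='Spark is fun', words=['spark', 'fun'])   (expected: 'is' is dropped)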

# Change a parameter.
reTokenizer.setParams(outputCol="tokens").transform(df).head()
# Row(text='A B  c', tokens=['a', 'b', 'c'])
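
# A small sketch (not in the original example): the Param getters report the
# values currently in effect after setParams.
reTokenizer.getOutputCol()
# 'tokens'
reTokenizer.getPattern()
# '\\s+'   (the default pattern, which splits on whitespace)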

# Temporarily modify a parameter.
reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()
# Row(text='A B  c', words=['a', 'b', 'c'])
reTokenizer.transform(df).head()
# Row(text='A B  c', tokens=['a', 'b', 'c'])
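
# A sketch of an alternative (not in the original example): copy() with an
# extra param map builds a separate tokenizer with the override applied
# permanently, leaving reTokenizer itself untouched.
wordTokenizer = reTokenizer.copy({reTokenizer.outputCol: "words"})
wordTokenizer.getOutputCol()
# 'words'
reTokenizer.getOutputCol()
# 'tokens'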

# Must use keyword arguments to specify params.
reTokenizer.setParams("text")
# Traceback (most recent call last):
#	...
# TypeError: Method setParams forces keyword arguments.
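
# For contrast, a minimal sketch: the same call succeeds when the parameter is
# passed by keyword, and setParams returns the tokenizer so calls can be chained.
reTokenizer.setParams(inputCol="text")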

# Save the tokenizer and load it back.
# (temp_path is assumed to be an existing writable directory, e.g. one created
# with tempfile.mkdtemp().)
regexTokenizerPath = temp_path + "/regex-tokenizer"
reTokenizer.save(regexTokenizerPath)
loadedReTokenizer = RegexTokenizer.load(regexTokenizerPath)
loadedReTokenizer.getMinTokenLength() == reTokenizer.getMinTokenLength()
# True
loadedReTokenizer.getGaps() == reTokenizer.getGaps()
# True
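
# A final sanity check (sketch): the reloaded tokenizer produces the same rows
# on the original DataFrame.
loadedReTokenizer.transform(df).take(1) == reTokenizer.transform(df).take(1)
# True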