R: Interface for the fasttext library

fasttext_interface {fastText}

R Documentation

Interface for the fasttext library

Description

Interface for the fasttext library

Usage

fasttext_interface(
  list_params,
  path_output = "",
  MilliSecs = 100,
  path_input = "",
  remove_previous_file = TRUE,
  print_process_time = FALSE
)

Arguments

`list_params`	a list of valid parameters
`path_output`	a character string specifying the file path where the process-logs (or the output in general) should be saved
`MilliSecs`	an integer specifying the delay in milliseconds when printing the results to the specified path_output
`path_input`	a character string specifying the path to the input data file
`remove_previous_file`	a boolean. If TRUE and the path_output is not an empty string (""), then an existing file with the same output name will be removed
`print_process_time`	a boolean. If TRUE then the processing time of the function will be printed out in the R session

Details

This function allows the user to run the various methods included in the fasttext library from within R

The "output" parameter, which exists in the named list (see the examples section) that is passed to the "list_params" parameter of the "fasttext_interface()" function, is a file path and not a directory name. The function will actually save two files (a *.vec* and a *.bin*) to the output directory.

Value

a vector of class character that includes the parameters and file paths used as input to the function

References

https://github.com/facebookresearch/fastText

https://github.com/facebookresearch/fastText/blob/master/docs/supervised-tutorial.md

Examples


## Not run: 

library(fastText)


####################################################################################
# If the user intends to run the following examples then he / she must replace     #
# the 'input', 'output', 'path_input', 'path_output', 'model' and 'test_data' file #
# paths depending on where the data are located or should be saved!                #
# ( 'tempdir()' is used here as an example folder )                                #
####################################################################################


# ------------------------------------------------
# print information for the Usage of each function [ parameters ]
# NOTE(review): the helper names suggest one usage printer per fasttext
# command (test, test-label, quantize, ...) - confirm in the package index
# ------------------------------------------------

fastText::printUsage()
fastText::printTestUsage()
fastText::printTestLabelUsage()
fastText::printQuantizeUsage()
fastText::printPrintWordVectorsUsage()
fastText::printPrintSentenceVectorsUsage()
fastText::printPrintNgramsUsage()
fastText::printPredictUsage()
fastText::printNNUsage()
fastText::printDumpUsage()
fastText::printAnalogiesUsage()
# NOTE(review): presumably prints the parameters accepted by the given
# 'command' (here 'supervised') - confirm against the function's help page
fastText::print_parameters(command = "supervised")

# -----------------------------------------------------------------------
# In case that the 'command' is one of 'cbow', 'skipgram' or 'supervised'
# -----------------------------------------------------------------------

# 'output' must be a file path and not a directory name (see the 'Details'
# section): two files, a .vec and a .bin, are saved using this path
list_params = list(command = 'cbow',
                   lr = 0.1,
                   dim = 200,
                   input = file.path(tempdir(), "doc.txt"),
                   output = file.path(tempdir(), "model_cbow"),
                   verbose = 2,
                   thread = 1)

# the process-logs are saved to 'path_output'; 'MilliSecs' is the delay in
# milliseconds when printing the results to that file
res = fasttext_interface(list_params,
                         path_output = file.path(tempdir(),"model_logs.txt"),
                         MilliSecs = 100)


# ------------------------------------------------------------------
# 'supervised' training  [ the model is saved using the 'output' path ]
# ------------------------------------------------------------------

list_params = list(
  command = "supervised",
  lr = 0.1,
  dim = 200,
  input = file.path(tempdir(), "cooking.train"),
  output = file.path(tempdir(), "model_cooking"),
  verbose = 2,
  thread = 1
)

# the process-logs of the training are saved to 'path_output'
res = fasttext_interface(
  list_params = list_params,
  path_output = file.path(tempdir(), "logs_supervise.txt"),
  MilliSecs = 5
)

# -----------------------------------------------------------------
# 'predict' command  [ requires a 'model' (.bin) and a 'test_data' file ]
# -----------------------------------------------------------------

list_params = list(
  command = "predict",
  model = file.path(tempdir(), "model_cooking.bin"),
  test_data = file.path(tempdir(), "cooking.valid"),
  k = 1,
  th = 0.0
)

# the output is saved to the 'path_output' file
res = fasttext_interface(
  list_params = list_params,
  path_output = file.path(tempdir(), "predict_valid.txt")
)


# --------------------------------------------------------------------
# 'test' command  [ k = 5 , means that precision and recall are at 5 ]
# --------------------------------------------------------------------

list_params = list(
  command = "test",
  model = file.path(tempdir(), "model_cooking.bin"),
  test_data = file.path(tempdir(), "cooking.valid"),
  k = 5,
  th = 0.0
)

# It only prints 'Precision', 'Recall' to the R session
res = fasttext_interface(list_params = list_params)


# --------------------------------------------------------------------------
# 'test-label' command  [ k = 5 , means that precision and recall are at 5 ]
# --------------------------------------------------------------------------

list_params = list(
  command = "test-label",
  model = file.path(tempdir(), "model_cooking.bin"),
  test_data = file.path(tempdir(), "cooking.valid"),
  k = 5,
  th = 0.0
)

# prints also 'Precision', 'Recall' to R session
res = fasttext_interface(
  list_params = list_params,
  path_output = file.path(tempdir(), "test_valid.txt")
)

# -----------------
# quantize function  [ it will take a .bin file and return an .ftz file ]
# -----------------

# the quantize function is currently (01/02/2019) single-threaded
# https://github.com/facebookresearch/fastText/issues/353#issuecomment-342501742

# the pattern '\\.bin$' replaces only the literal '.bin' file extension
# ( an unescaped '.' is a regular expression wildcard that matches any character )
list_params = list(command = 'quantize',
                   input = file.path(tempdir(), 'model_cooking.bin'),
                   output = file.path(tempdir(),
                                      sub('\\.bin$', '.ftz', 'model_cooking.bin')))

res = fasttext_interface(list_params)


# -----------------
# quantize function  [ by using the optional parameters 'qnorm' and 'qout' ]
# -----------------

# the pattern '\\.bin$' replaces only the literal '.bin' file extension
# ( an unescaped '.' is a regular expression wildcard that matches any character )
list_params = list(command = 'quantize',
                   input = file.path(tempdir(), 'model_cooking.bin'),
                   output = file.path(tempdir(),
                                      sub('\\.bin$', '.ftz', 'model_cooking.bin')),
                   qnorm = TRUE,
                   qout = TRUE)

res = fasttext_interface(list_params)


# ---------------------------------------------------------------------------
# print-word-vectors   [ each line of the 'queries.txt' must be a single word ]
# ---------------------------------------------------------------------------

list_params = list(
  command = "print-word-vectors",
  model = file.path(tempdir(), "model_cooking.bin")
)

# 'path_input' is the file with the query words, 'path_output' the file
# where the output is saved
res = fasttext_interface(
  list_params = list_params,
  path_input = file.path(tempdir(), "queries.txt"),
  path_output = file.path(tempdir(), "print_vecs_file.txt")
)


# ------------------------------------------------------------------------------------------
# print-sentence-vectors   [ See also the comments in the main.cc file about the input-file ]
# ------------------------------------------------------------------------------------------

list_params = list(
  command = "print-sentence-vectors",
  model = file.path(tempdir(), "model_cooking.bin")
)

res = fasttext_interface(
  list_params = list_params,
  path_input = file.path(tempdir(), "text.txt"),
  path_output = file.path(tempdir(), "SENTENCE_VECs.txt")
)


# ------------
# print-ngrams       [ print to console or to output-file ]
# ------------

# first train a 'skipgram' model; 'output' must be a file path ( not a directory )
# so that the model is saved as 'ngram_out.bin', which is the file that the
# 'print-ngrams' command loads further below
list_params = list(command = 'skipgram', lr = 0.1, dim = 200,
                   input = file.path(tempdir(), "doc.txt"),
                   output = file.path(tempdir(), "ngram_out"),
                   verbose = 2, thread = 1,
                   minn = 2, maxn = 2)

res = fasttext_interface(list_params,
                         path_output = file.path(tempdir(), "ngram_out.txt"),
                         MilliSecs = 5)

list_params = list(command = 'print-ngrams',
                   model = file.path(tempdir(), 'ngram_out.bin'),
                   word = 'word')                           # print n-grams for specific word

res = fasttext_interface(list_params, path_output = "")             # print output to console
res = fasttext_interface(list_params,
                         path_output = file.path(tempdir(), "NGRAMS.txt"))   # output to file


# ------------------------------------------------
# 'nn' function  [ a 'query_word' is required ]
# ------------------------------------------------

list_params = list(
  command = "nn",
  model = file.path(tempdir(), "model_cooking.bin"),
  k = 20,
  query_word = "word"
)

res = fasttext_interface(
  list_params = list_params,
  path_output = file.path(tempdir(), "nn_output.txt")
)


# ------------------------------------------------------------------------------------------
# analogies   [ in the output file each analogy-triplet-result is separated with a newline ]
# ------------------------------------------------------------------------------------------

list_params = list(
  command = "analogies",
  model = file.path(tempdir(), "model_cooking.bin"),
  k = 5
)

res = fasttext_interface(
  list_params = list_params,
  path_input = file.path(tempdir(), "analogy_queries.txt"),
  path_output = file.path(tempdir(), "analogies_output.txt")
)

# -----------------------------------------------------------------------------------------
# dump function  [ the 'option' param should be one of 'args', 'dict', 'input' or 'output' ]
# -----------------------------------------------------------------------------------------

list_params = list(
  command = "dump",
  model = file.path(tempdir(), "model_cooking.bin"),
  option = "args"
)

res = fasttext_interface(
  list_params = list_params,
  path_output = file.path(tempdir(), "DUMP.txt")
)


## End(Not run)

[Package fastText version 1.0.4 Index]