library(spacyr)
# Two short example documents, named doc1 and doc2. The doc1 text is
# assembled from pieces only to keep source lines short; its value is a
# single string that ends in a space.
txt <- c(
  doc1 = paste0(
    "Natural Language Processing is a branch of computer science that ",
    "employs various Artificial Intelligence (AI) techniques to process ",
    "content written in natural language. ",
    "NLP-enhanced wikis can support users in finding, developing and ",
    "organizing knowledge contained inside the wiki repository. "
  ),
  doc2 = "Paul earned a postgraduate degree from MIT."
)

# Parse both documents with noun-phrase annotation requested, keep the
# result in `tmp` for the calls further down, and print it.
tmp <- spacy_parse(txt, nounphrase = TRUE)
tmp
## Found 'spacy_condaenv'. spacyr will use this environment
## successfully initialized (spaCy Version: 2.0.16, language model: en)
## (python options: type = "condaenv", value = "spacy_condaenv")
## doc_id sentence_id token_id token lemma pos entity
## 1 doc1 1 1 Natural natural PROPN ORG_B
## 2 doc1 1 2 Language language PROPN ORG_I
## 3 doc1 1 3 Processing processing PROPN ORG_I
## 4 doc1 1 4 is be VERB
## 5 doc1 1 5 a a DET
## 6 doc1 1 6 branch branch NOUN
## 7 doc1 1 7 of of ADP
## 8 doc1 1 8 computer computer NOUN
## 9 doc1 1 9 science science NOUN
## 10 doc1 1 10 that that ADJ
## 11 doc1 1 11 employs employ VERB
## 12 doc1 1 12 various various ADJ
## 13 doc1 1 13 Artificial artificial PROPN ORG_B
## 14 doc1 1 14 Intelligence intelligence PROPN ORG_I
## 15 doc1 1 15 ( ( PUNCT ORG_I
## 16 doc1 1 16 AI ai PROPN ORG_I
## 17 doc1 1 17 ) ) PUNCT
## 18 doc1 1 18 techniques technique NOUN
## 19 doc1 1 19 to to PART
## 20 doc1 1 20 process process VERB
## 21 doc1 1 21 content content NOUN
## 22 doc1 1 22 written write VERB
## 23 doc1 1 23 in in ADP
## 24 doc1 1 24 natural natural ADJ
## 25 doc1 1 25 language language NOUN
## 26 doc1 1 26 . . PUNCT
## 27 doc1 2 1 NLP nlp PROPN ORG_B
## 28 doc1 2 2 - - PUNCT
## 29 doc1 2 3 enhanced enhance VERB
## 30 doc1 2 4 wikis wiki NOUN
## 31 doc1 2 5 can can VERB
## 32 doc1 2 6 support support VERB
## 33 doc1 2 7 users user NOUN
## 34 doc1 2 8 in in ADP
## 35 doc1 2 9 finding find VERB
## 36 doc1 2 10 , , PUNCT
## 37 doc1 2 11 developing develop VERB
## 38 doc1 2 12 and and CCONJ
## 39 doc1 2 13 organizing organize VERB
## 40 doc1 2 14 knowledge knowledge NOUN
## 41 doc1 2 15 contained contain VERB
## 42 doc1 2 16 inside inside ADP
## 43 doc1 2 17 the the DET
## 44 doc1 2 18 wiki wiki NOUN
## 45 doc1 2 19 repository repository NOUN
## 46 doc1 2 20 . . PUNCT
## 47 doc2 1 1 Paul paul PROPN ORG_B
## 48 doc2 1 2 earned earn VERB
## 49 doc2 1 3 a a DET
## 50 doc2 1 4 postgraduate postgraduate NOUN
## 51 doc2 1 5 degree degree NOUN
## 52 doc2 1 6 from from ADP
## 53 doc2 1 7 MIT mit PROPN ORG_B
## 54 doc2 1 8 . . PUNCT
## nounphrase whitespace
## 1 beg TRUE
## 2 mid TRUE
## 3 end_root TRUE
## 4 TRUE
## 5 beg TRUE
## 6 end_root TRUE
## 7 TRUE
## 8 beg TRUE
## 9 end_root TRUE
## 10 TRUE
## 11 TRUE
## 12 beg TRUE
## 13 mid TRUE
## 14 mid TRUE
## 15 mid FALSE
## 16 mid FALSE
## 17 mid TRUE
## 18 end_root TRUE
## 19 TRUE
## 20 TRUE
## 21 beg_root TRUE
## 22 TRUE
## 23 TRUE
## 24 beg TRUE
## 25 end_root FALSE
## 26 TRUE
## 27 beg FALSE
## 28 mid FALSE
## 29 mid TRUE
## 30 end_root TRUE
## 31 TRUE
## 32 TRUE
## 33 beg_root TRUE
## 34 TRUE
## 35 FALSE
## 36 TRUE
## 37 TRUE
## 38 TRUE
## 39 TRUE
## 40 beg_root TRUE
## 41 TRUE
## 42 TRUE
## 43 beg TRUE
## 44 mid TRUE
## 45 end_root FALSE
## 46 TRUE
## 47 beg_root TRUE
## 48 TRUE
## 49 beg TRUE
## 50 mid TRUE
## 51 end_root TRUE
## 52 TRUE
## 53 beg_root FALSE
## 54 FALSE
# Pull the noun phrases out of the parsed tokens. In the output below each
# phrase is one row, identified by doc_id and sentence_id, with the phrase
# text in `nounphrase` and its root word in `root_token`.
nounphrase_extract(tmp)
## doc_id sentence_id nounphrase
## 1 doc1 1 Natural Language Processing
## 2 doc1 1 a branch
## 3 doc1 1 computer science
## 4 doc1 1 various Artificial Intelligence (AI) techniques
## 5 doc1 1 content
## 6 doc1 1 natural language
## 7 doc1 2 NLP-enhanced wikis
## 8 doc1 2 users
## 9 doc1 2 knowledge
## 10 doc1 2 the wiki repository
## 11 doc2 1 Paul
## 12 doc2 1 a postgraduate degree
## 13 doc2 1 MIT
## root_token
## 1 Processing
## 2 branch
## 3 science
## 4 techniques
## 5 content
## 6 language
## 7 wikis
## 8 users
## 9 knowledge
## 10 repository
## 11 Paul
## 12 degree
## 13 MIT
# Collapse every noun phrase in the parsed tokens into a single token row.
# In the output below the words of a phrase are joined with "_" in both
# `token` and `lemma`, `pos` becomes "nounphrase", and token_id is
# renumbered within each sentence (54 rows before, 36 after).
nounphrase_consolidate(tmp)
## doc_id sentence_id token_id
## 1 doc1 1 1
## 2 doc1 1 2
## 3 doc1 1 3
## 4 doc1 1 4
## 5 doc1 1 5
## 6 doc1 1 6
## 7 doc1 1 7
## 8 doc1 1 8
## 9 doc1 1 9
## 10 doc1 1 10
## 11 doc1 1 11
## 12 doc1 1 12
## 13 doc1 1 13
## 14 doc1 1 14
## 15 doc1 1 15
## 16 doc1 2 1
## 17 doc1 2 2
## 18 doc1 2 3
## 19 doc1 2 4
## 20 doc1 2 5
## 21 doc1 2 6
## 22 doc1 2 7
## 23 doc1 2 8
## 24 doc1 2 9
## 25 doc1 2 10
## 26 doc1 2 11
## 27 doc1 2 12
## 28 doc1 2 13
## 29 doc1 2 14
## 30 doc1 2 15
## 31 doc2 1 1
## 32 doc2 1 2
## 33 doc2 1 3
## 34 doc2 1 4
## 35 doc2 1 5
## 36 doc2 1 6
## token
## 1 Natural_Language_Processing
## 2 is
## 3 a_branch
## 4 of
## 5 computer_science
## 6 that
## 7 employs
## 8 various_Artificial_Intelligence_(AI)_techniques
## 9 to
## 10 process
## 11 content
## 12 written
## 13 in
## 14 natural_language
## 15 .
## 16 NLP-enhanced_wikis
## 17 can
## 18 support
## 19 users
## 20 in
## 21 finding
## 22 ,
## 23 developing
## 24 and
## 25 organizing
## 26 knowledge
## 27 contained
## 28 inside
## 29 the_wiki_repository
## 30 .
## 31 Paul
## 32 earned
## 33 a_postgraduate_degree
## 34 from
## 35 MIT
## 36 .
## lemma pos entity
## 1 natural_language_processing nounphrase ORG_B
## 2 be VERB
## 3 a_branch nounphrase
## 4 of ADP
## 5 computer_science nounphrase
## 6 that ADJ
## 7 employ VERB
## 8 various_artificial_intelligence_(ai)_technique nounphrase
## 9 to PART
## 10 process VERB
## 11 content nounphrase
## 12 write VERB
## 13 in ADP
## 14 natural_language nounphrase
## 15 . PUNCT
## 16 nlp-enhance_wiki nounphrase ORG_B
## 17 can VERB
## 18 support VERB
## 19 user nounphrase
## 20 in ADP
## 21 find VERB
## 22 , PUNCT
## 23 develop VERB
## 24 and CCONJ
## 25 organize VERB
## 26 knowledge nounphrase
## 27 contain VERB
## 28 inside ADP
## 29 the_wiki_repository nounphrase
## 30 . PUNCT
## 31 paul nounphrase ORG_B
## 32 earn VERB
## 33 a_postgraduate_degree nounphrase
## 34 from ADP
## 35 mit nounphrase ORG_B
## 36 . PUNCT