hw_3.Rmd

Задание 3.1

library("tidyverse")

## -- Attaching packages -------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ----------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Read the whole JSON file into an R list.
data_all <- jsonlite::read_json("C:/R/HW_03/data/fpl_data_2018_2019.json")

# Extract the "Goals" entry of every top-level element, flatten the result
# into a vector and store it as a two-column tibble: `name` (as produced by
# enframe()) and `goals` (converted to numeric).
data_goals <- data_all %>%
  map("Goals") %>%
  unlist() %>%
  enframe(name = "name", value = "goals") %>%
  mutate(goals = as.numeric(goals))

# Extract the "Club" entry of every top-level element of `data_all`, flatten
# the result into a vector and store it as a two-column tibble: `name`
# (as produced by enframe()) and `club`.
data_clubs <- data_all %>%
  map("Club") %>%
  unlist() %>%
  enframe(name = "name", value = "club")

# Combine the club and goal tables on `name`, sort by goals in descending
# order and keep the first 30 rows.
# The join key is spelled out so that the join cannot silently switch to a
# different set of shared columns, and so that no "Joining, by = ..." message
# is emitted.
data_end <- data_clubs %>%
  full_join(data_goals, by = "name") %>%
  arrange(desc(goals)) %>%
  slice(1:30)

## Joining, by = "name"

# Horizontal bar chart of goals per `name`; bars are ordered by goal count.
ggplot(
  data_end,
  aes(
    x = fct_reorder(name, goals),
    y = goals,
    fill = club # fill colours the bars according to this variable
  )
) +
  geom_col() +
  labs(
    title = "Number of Goals",
    caption = "data from www.premierleague.com",
    x = "",
    y = ""
  ) +
  coord_flip()

Задание 3.2

library(tidyverse)
library(gutenbergr)

# Look up the Project Gutenberg metadata rows whose title matches the name of
# the collection; the result is printed, not stored.
filter(
  gutenberg_metadata,
  str_detect(
    string = title,
    pattern = "Женское международное движение: Сборник статей"
  )
)

## # A tibble: 1 x 8
##   gutenberg_id title author gutenberg_autho~ language gutenberg_books~
##          <int> <chr> <chr>             <int> <chr>    <chr>           
## 1        37196 Женс~ Vario~              116 ru       <NA>            
## # ... with 2 more variables: rights <chr>, has_text <lgl>

# Download the text of the work with Gutenberg id 37196.
book <- gutenberg_download(gutenberg_id = 37196)

## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

# Load the Russian stop-word list from a local CSV file.
stop_rus_wors <- read_csv(file = "C:/R/HW_03/data/stopwords-ru.csv")

## Parsed with column specification:
## cols(
##   word = col_character()
## )

library(tidytext)
# Split the book text into word tokens, count every word (most frequent
# first), drop the words listed in the stop-word table and keep the first
# 20 remaining rows.
# The anti-join key is given explicitly so the filter always matches on
# `word` and no "Joining, by = ..." message is emitted.
words_end <- book %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_rus_wors, by = "word") %>%
  slice(1:20)

## Joining, by = "word"

# Horizontal bar chart of the retained words, ordered by their count `n`.
ggplot(
  words_end,
  aes(x = fct_reorder(word, n), y = n)
) +
  geom_col() +
  labs(
    title = "'Женское международное движение: Сборник статей': частотные слова",
    caption = "source www.gutenberg.org",
    x = "",
    y = ""
  ) +
  coord_flip()

Задание 3.3

library(tidyverse)
library(gutenbergr)
library(tidytext)
library(udpipe)

# Look up the Project Gutenberg metadata rows whose title matches the name of
# the poem; the result is printed, not stored.
filter(
  gutenberg_metadata,
  str_detect(
    string = title,
    pattern = "Красавице, которая нюхала табак"
  )
)

## # A tibble: 1 x 8
##   gutenberg_id title author gutenberg_autho~ language gutenberg_books~
##          <int> <chr> <chr>             <int> <chr>    <chr>           
## 1         5316 Крас~ Pushk~             1457 ru       <NA>            
## # ... with 2 more variables: rights <chr>, has_text <lgl>

# Download the text of the work with Gutenberg id 5316.
book_1 <- gutenberg_download(gutenberg_id = 5316)

# Fetch the "russian-syntagrus" UDPipe model.
# `overwrite = FALSE` reuses a model file that already exists in the model
# directory instead of downloading and overwriting it on every run.
rumodel <- udpipe_download_model(
  language = "russian-syntagrus",
  overwrite = FALSE
)

## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.4/master/inst/udpipe-ud-2.4-190531/russian-syntagrus-ud-2.4-190531.udpipe to C:/R/HW_03/russian-syntagrus-ud-2.4-190531.udpipe

## Visit https://github.com/jwijffels/udpipe.models.ud.2.4 for model license details

# Join all rows of the `text` column into one string, separated by spaces.
long_line <- str_c(book_1[["text"]], collapse = " ")

# Annotate that string with the downloaded model.
book_data <- udpipe(x = long_line, object = rumodel)

# Count tokens per part-of-speech tag (`upos`) and draw the counts as a bar
# chart. Before counting, the tags are adjusted:
#   * lemma "быть" -> VERB
#   * lemma "бы"   -> PART
#   * tag DET      -> PART
# Lemmas are compared for exact equality. A substring test would also relabel
# unrelated words that merely contain "бы" (e.g. "быстрый", "чтобы") as PART.
book_data %>%
  mutate(upos = case_when(
    lemma == "быть" ~ "VERB",
    lemma == "бы" ~ "PART",
    upos == "DET" ~ "PART",
    TRUE ~ upos
  )) %>%
  count(upos, sort = TRUE) %>%
  ggplot(aes(fct_reorder(upos, n), n)) +
  geom_col() +
  labs(x = "",
       y = "",
       title = "Красавице, которая нюхала табак")

hw_3.Rmd

Александр Дюльденко

18 12 2019

Задание 3.1

Задание 3.2

Задание 3.3