---
tags: tekstiaine
---
# Jagatud märkmed (tekstitöötlus 2022)
## 13.04.2022
Arvutiklassis installiks
```
# Packages to install on the computer-lab machines, one call per package.
for (pkg in c("rlang", "tidyverse", "tidytext")) {
  install.packages(pkg)
}
```
Control + Shift + M korraga teeb %>% toru.
```
# Keep the rows of a single country.
gapminder %>%
  filter(country == "France")

# Drop a country by negating the comparison ...
gapminder %>%
  filter(!(country == "Germany"))

# ... or by using the "not equal" operator directly.
gapminder %>%
  filter(country != "Germany")
```


```
# Keep filtered subsets as named objects: one country and two continents.
brasiilia <- filter(gapminder, country == "Brazil")
ameerika <- filter(gapminder, continent == "Americas")
okeaania <- filter(gapminder, continent == "Oceania")
```
```
# Finland's rows, latest year first.
gapminder %>%
  filter(country == "Finland") %>%
  arrange(desc(year))

# Open the help page for filter().
help("filter")
```
```
# European rows for 1952, ordered by country.
# `by_name` is not an argument of arrange(); the original `by_name = T` was
# only picked up as an extra constant sort key and did not affect the order,
# so it is dropped here.
gapminder %>%
  filter(continent == "Europe", year == 1952) %>%
  arrange(country)
# European rows for 1952 ordered by country, stored as a named object.
järjestus <- gapminder %>%
  filter(continent == "Europe", year == 1952) %>%
  arrange(country)
```
```
# Only the GDP-per-capita and life-expectancy columns for Europe in 1952.
gapminder %>%
  filter(continent == "Europe", year == 1952) %>%
  select(gdpPercap, lifeExp)
# Add a running row number. row_number() adapts to the data, whereas the
# original hard-coded 1:1704 fails as soon as the table has any other number
# of rows.
gapminder %>%
  mutate(nr = row_number())
```
```
# Scatter plot of Brazil's gdpPercap against year.
gapminder %>%
  filter(country == "Brazil") %>%
  ggplot(mapping = aes(x = year, y = gdpPercap)) +
  geom_point() +
  labs(title = "Brasiilia rikkus 1952-2007")
```










## 20.04.2022
```
# Four ways to list Smilers' songs ranked 6-10.
# Variant 1: filter on rank first, then on artist.
edetabel %>%
  filter(rank %in% 6:10) %>%
  filter(artist == "Smilers") %>%
  select(year, song, rank)

# Variant 2: select first, filter on rank afterwards (select() keeps rank).
edetabel %>%
  filter(artist == "Smilers") %>%
  select(year, song, rank) %>%
  filter(rank %in% 6:10)

# Variant 3: both conditions in a single filter(), joined with `&`.
edetabel %>%
  filter(artist == "Smilers" & rank %in% 6:10) %>%
  select(year, song, rank)

# Variant 4: the range written as two comparisons; this matches 6:10 as long
# as rank only holds whole numbers.
edetabel %>%
  filter(artist == "Smilers" & rank > 5 & rank < 11) %>%
  select(year, song, rank)
```
```
edetabel %>%
filter(artist=="Maarja") %>%
count(year)
```
```
# English-language songs from before 2000.
edetabel %>%
  filter(language == "en", year < 2000) %>%
  select(year, rank, song)

# English-language songs from the years 1990-1999 only.
edetabel %>%
  filter(language == "en", year %in% 1990:1999) %>%
  select(year, rank, song)
```
```
# Ten randomly chosen rows with rank 1. slice_sample() replaces the
# superseded sample_n(); unlike sample_n(), it returns all available rows
# instead of raising an error when fewer than ten rows match.
edetabel %>%
  filter(rank == 1) %>%
  slice_sample(n = 10) %>%
  select(rank, year, song)
```
```
# Number of chart rows per year for the artist "Nancy".
edetabel %>%
  filter(artist == "Nancy") %>%
  count(year)
# Rows of artists whose earliest year in the table is after 2000.
# `first` / `last` hold each artist's minimum and maximum year.
edetabel %>%
  group_by(artist) %>%
  mutate(first = min(year), last = max(year)) %>%
  # Drop the grouping once the per-artist values exist, so the result does
  # not stay grouped by artist.
  ungroup() %>%
  filter(first > 2000)
```
```
# For every artist keep the row that comes last after sorting by rank.
# artistbest numbers an artist's rows in rank order; lastplace is the highest
# such number, so the filter keeps exactly one row per artist.
edetabel %>%
  arrange(artist, rank) %>%
  group_by(artist) %>%
  mutate(
    artistbest = row_number(),
    lastplace = max(artistbest)
  ) %>%
  filter(artistbest == lastplace) %>%
  ungroup()
```
## 27.04.2022
```
# Download the stop-word list; the file has no header row, so its single
# column is named "word" here.
stopsonad <- read_tsv(
  "https://datadoi.ee/bitstream/handle/33/78/estonian-stopwords.txt?sequence=1&isAllowed=y",
  col_names = "word"
)
```
```
# Share of each stop word among all word rows of the artist "Nublu".
# NOTE(review): stopsonad2 is not defined in these notes; it is presumably
# the stop-word list with an added logical column `onstopsona` - confirm.
laulusonad %>%
  left_join(stopsonad2, by = "word") %>%
  group_by(artist) %>%
  filter(artist == "Nublu") %>%
  count(onstopsona, word, sort = TRUE) %>%
  # The proportion is taken over all counted words, so it has to be computed
  # before the non-stop-words are removed.
  mutate(proportsioon = n / sum(n)) %>%
  filter(onstopsona == TRUE) %>%
  arrange(desc(proportsioon)) %>%
  ungroup()
```
```
NA == NA     # NA: comparing with a missing value gives a missing result
is.na(NA)    # TRUE: is.na() is the way to test for a missing value
is.na("NA")  # FALSE: the string "NA" is ordinary text, not a missing value
```
```
# Word-frequency table with columns word, n_token and n_docs, plus each
# word's share of all tokens (prop_ilukirj).
# Fetched over https, as the stop-word list from the same host already is,
# instead of plain http.
ilukirj_sonad <- read_tsv(
  "https://datadoi.ee/bitstream/handle/33/41/token_1_grams.tsv?sequence=4&isAllowed=y",
  col_names = c("word", "n_token", "n_docs")
) %>%
  mutate(prop_ilukirj = n_token / sum(n_token))
```
```
# Word counts per song for rows with language "et", with each word's share
# of its song. The token "ref" is removed before the shares are computed.
laulusonad %>%
  filter(language == "et") %>%
  group_by(artist, song) %>%
  count(word, sort = TRUE) %>%
  filter(!word %in% c("ref")) %>%
  mutate(proportsioon = n / sum(n)) %>%
  # The shares are per song; drop the grouping so the result is a plain table.
  ungroup() %>%
  arrange(desc(n))
```
```
# Word counts per song for rows with language "et", after removing every
# word found in stopsonad. The token "ref" is removed before the shares are
# computed.
laulusonad %>%
  filter(language == "et") %>%
  anti_join(stopsonad, by = "word") %>%
  group_by(artist, song) %>%
  count(word, sort = TRUE) %>%
  filter(!word %in% c("ref")) %>%
  mutate(proportsioon = n / sum(n)) %>%
  # The shares are per song; drop the grouping so the result is a plain table.
  ungroup() %>%
  arrange(desc(n))
```
```
# Relative position of every "ei" row within each song of the artist "reket":
# asukoht = row index within the song / number of rows in the song.
laulusonad %>%
  # Restrict to the artist first, so positions are only computed for the
  # songs that are kept anyway.
  filter(artist == "reket") %>%
  group_by(artist, song) %>%
  mutate(
    rownr = row_number(),
    n = n(),
    asukoht = rownr / n
  ) %>%
  ungroup() %>%
  # The word filter must come last: positions are relative to all rows of
  # the song, not only to the matching ones.
  filter(word == "ei")
```
## 04.05.2022
### Ühe artisti laulude ülevaade, graafikud:
Jam









Terminaator
### Mõne sõna asukohad mõnes loos

olla ja saada, võimalik, et liiga palju






### Regulaaravaldistest
```
"me oleme siin" %>% str_detect("me")
"me oleme siin" %>% str_detect("ma")
"me oleme siin" %>% str_detect("^me")
"me oleme siin" %>% str_detect(" me ")
"me oleme siin" %>% str_extract("me")
"me oleme siin" %>% str_extract("ma")
"me oleme siin" %>% str_extract_all("me")
"me oleme siin" %>% str_extract("me")
```



## 11.05.2022









```
# The first ten rows of the five-gram frequency table (most frequent first);
# rownr is the position in that ordering.
# NOTE(review): if viisgrammid is a grouped table, counting and numbering
# happen per group - confirm.
top5grams <- viisgrammid %>%
  count(fivegram, sort = TRUE) %>%
  mutate(rownr = row_number()) %>%
  filter(rownr < 11)
# Relative position of each five-gram row: row index divided by
# (row count + 1). Any grouping on viisgrammid is dropped at the end.
asukohad_viisgrammid <- viisgrammid %>%
  mutate(
    nr = row_number(),
    n = n(),
    asukoht = nr / (n + 1)
  ) %>%
  ungroup()
# Plot where the five-grams listed in top5grams occur: one point per
# occurrence, position on the x axis, five-gram on the y axis.
ggplot(
  inner_join(asukohad_viisgrammid, top5grams, by = "fivegram"),
  aes(x = asukoht, y = fivegram)
) +
  geom_point()
```
```
# Label plot of the first ten rows for chapters 10-20: chapter on the x axis,
# row position on the y axis, the word as the label.
# NOTE(review): presumably tf_idfs_peatykid is grouped by chapter, so that
# row_number() restarts in every chapter - confirm.
tf_idfs_peatykid %>%
  mutate(row_number = row_number()) %>%
  filter(row_number < 11, chapter > 9, chapter < 21) %>%
  ggplot(aes(x = chapter, y = row_number, label = word)) +
  geom_label()
```
```
# Horizontal bar chart of the number of rows in words per filename.
ggplot(count(words, filename), aes(x = n, y = filename)) +
  geom_col()
```
```
# Word frequencies over all of words, most frequent first.
words %>%
  count(word, sort = TRUE)

# Bar chart of the first ten rows of that frequency table.
words %>%
  count(word, sort = TRUE) %>%
  filter(row_number() < 11) %>%
  ggplot(aes(x = n, y = word)) +
  geom_col()
```
```
# Word frequencies counted separately for each filename, highest counts first.
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  ungroup()
```
```
# The ten most frequent words of one file.
words %>%
  # Pick the file first, so only its words are counted.
  filter(filename == "Andres_Saal_Vambola.utf8") %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  filter(row_number() < 11) %>%
  ungroup()
```
```
# The ten most frequent words of every file.
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  # The table is still grouped by filename here, so row_number() restarts
  # for each file.
  filter(row_number() < 11) %>%
  ungroup()
```