---
tags: tekstiaine
---
# Tunni jooksul tehtud märkmed (SVUH.00.093 22.04.2023)
Koondan siia tunni jooksul tehtud märkmed kergemaks kopeerimiseks.
```
Siia saab
koodi kirjutada
```

Peetri näidis

Triin

Roosi

Hanna

Marleen

Elika

Ingeliis

Carmen

Vivian
```
# Dispersion plot: where forms of "peremees" occur along the text.
# `asukohad` is built elsewhere; it is used here with the columns `word` and
# `asukoht`.
asukohad %>%
  filter(str_detect(word, "peremees|peremehe")) %>%
  # Both inflected forms are reduced to the shared stem, so they plot as one row.
  mutate(type = str_extract(word, "pereme")) %>%
  ggplot(aes(x = asukoht, y = type)) +
  geom_point()

# Same plot for words beginning with "soo" or "soh".
asukohad %>%
  filter(str_detect(word, "^so[oh]")) %>%
  # Extract with the pattern that was filtered on. Extracting "pereme" here
  # (as the copied line did) almost never matches and leaves `type` as NA.
  mutate(type = str_extract(word, "^so[oh]")) %>%
  ggplot(aes(x = asukoht, y = type)) +
  geom_point()
```
### viisgrammid
```
# Relative position of every five-gram: row index divided by (row count + 1),
# so `asukoht` always lies strictly between 0 and 1.
# NOTE(review): the trailing ungroup() suggests `viisgrammid` arrives grouped;
# in that case positions are computed per group - confirm where it is built.
viis_asukohad <- viisgrammid %>%
  mutate(nr = row_number(), n = n()) %>%
  mutate(asukoht = nr / (n + 1)) %>%
  ungroup()

# Where two hand-picked five-grams occur in the text.
viis_asukohad %>%
  filter(
    fivegram %in% c(
      "vurr vurr vurr vurr vurr",
      "nõnda kestis see tükk aega"
    )
  ) %>%
  ggplot(aes(x = asukoht, y = fivegram)) +
  geom_point()

# The ten rows with the largest relative position (ties are kept).
# top_n(10) without `wt` silently picked the last column (`asukoht`);
# slice_max() names the column explicitly and returns the rows sorted.
asukohad2 <- viisgrammid %>%
  mutate(nr = row_number(), n = n()) %>%
  mutate(asukoht = nr / (n + 1)) %>%
  ungroup() %>%
  slice_max(asukoht, n = 10)

# Plot only the first ten five-grams.
# `asukoht` is always below 1, so the earlier `asukoht <= 10` kept every row;
# the row index `nr` is what limits the plot to ten.
viisgrammid %>%
  mutate(nr = row_number(), n = n()) %>%
  mutate(asukoht = nr / (n + 1)) %>%
  filter(nr <= 10) %>%
  ggplot(aes(x = asukoht, y = fivegram)) +
  geom_point()
```

```
# Keep only five-grams that occur more than three times, together with the
# relative position (row index / (row count + 1)) of each occurrence.
asukohad_valik <- viisgrammid %>%
  mutate(
    nr = row_number(),
    n = n(),
    asukoht = nr / (n + 1)
  ) %>%
  group_by(fivegram) %>%
  mutate(word_n = n()) %>% # occurrences of this five-gram
  ungroup() %>%
  filter(word_n >= 4L)

# One row of points per retained five-gram.
ggplot(asukohad_valik, aes(x = asukoht, y = fivegram)) +
  geom_point()
```

```
# Per-chapter word counts plus each word's relative frequency in its chapter.
# `raamat1` is defined elsewhere; it is used here with `chapter` and `txt`.
peatykid_sonad2 <- raamat1 %>%
  group_by(chapter) %>%
  unnest_tokens(word, txt, to_lower = FALSE) %>%
  count(word, sort = TRUE) %>%
  # Divide the count column `n`, not the row-count helper `n()`:
  # `n() / sum(n)` is one constant per chapter, not a per-word frequency.
  mutate(sagedus = n / sum(n))

# Label plot of the first ten remaining words for chapters below 11, after
# dropping stop words and any word containing an upper-case letter.
# The data are still grouped by chapter, so row_number() restarts per chapter.
peatykid_sonad2 %>%
  anti_join(stopwords, "word") %>%
  filter(!str_detect(word, "[A-ZÕÄÖÜ]")) %>%
  mutate(row_number = row_number()) %>%
  filter(row_number < 11) %>%
  filter(chapter < 11) %>%
  ggplot(aes(x = chapter, y = row_number, label = word)) +
  geom_label()

# Same layout for chapters 10-20 from the tf-idf table.
# NOTE(review): `tf_idfs_peatykid` is built elsewhere; presumably it is grouped
# by chapter and sorted by tf-idf, otherwise row_number() is global - confirm.
tf_idfs_peatykid %>%
  mutate(row_number = row_number()) %>%
  filter(row_number < 11) %>%
  filter(chapter >= 10, chapter < 21) %>%
  ggplot(aes(x = chapter, y = row_number, label = word)) +
  geom_label()
```
```
# Bar chart of the number of rows in `words` per file.
words %>%
  count(filename) %>%
  ggplot(mapping = aes(x = n, y = filename)) +
  geom_col()
```
```
# Rows per file, with the count copied into a descriptively named column.
words %>%
  count(filename) %>%
  mutate(
    sõnad_raamatus = n
  )
```
```
# Bar chart of the ten most frequent words across all files.
words %>%
  # Spell out TRUE: the shorthand `T` is an ordinary variable that can be
  # reassigned.
  count(word, sort = TRUE) %>%
  mutate(nr = row_number()) %>% # rank by descending count
  filter(nr < 11) %>%
  ggplot(aes(y = word, x = n)) +
  geom_col()
```
```
# Word counts within each file, most frequent first.
# TRUE is written out because the shorthand `T` can be reassigned.
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE)
```
```
# Bar chart of the ten most frequent words in a single file.
words %>%
  filter(filename == "Elisabeth_Aspe_Ennosaare_Ain.utf8") %>%
  # TRUE is written out because the shorthand `T` can be reassigned.
  count(word, sort = TRUE) %>%
  mutate(nr = row_number()) %>% # rank by descending count
  filter(nr < 11) %>%
  ggplot(aes(y = word, x = n)) +
  geom_col()
```
```
# Ten most frequent words of every file.
words %>%
  group_by(filename) %>%
  # TRUE is written out because the shorthand `T` can be reassigned.
  count(word, sort = TRUE) %>%
  # The result is still grouped by filename, so the rank restarts per file.
  mutate(nr = row_number()) %>%
  filter(nr < 11)
```
```
# Relative frequency of the word "tüdruk" in each file, as a dot plot.
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  # Still grouped by filename, so sum(n) is the file's total count.
  mutate(freq = n / sum(n)) %>%
  filter(word == "tüdruk") %>%
  ggplot(aes(y = filename, x = freq)) +
  geom_point() +
  # `guides(color = FALSE)` is deprecated in ggplot2; "none" is the supported
  # way to suppress a legend.
  guides(color = "none")
```
```
# Dot plot: share of each file's words that are exactly "tüdruk".
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  # Grouping by filename is kept after count(), so the denominator is per file.
  mutate(freq = n / sum(n)) %>%
  filter(word == "tüdruk") %>%
  ggplot(aes(y = filename, x = freq)) +
  geom_point() +
  # Use "none" instead of the deprecated `FALSE`/`F` to switch the legend off.
  guides(color = "none")
```
```
# For files whose name starts with "Juri": combined relative frequency of
# words beginning with "kirik" and of words beginning with "kõrts".
words %>%
  group_by(filename) %>%
  filter(str_detect(filename, "^Juri")) %>%
  # TRUE is written out because the shorthand `T` can be reassigned.
  count(word, sort = TRUE) %>%
  mutate(freq = n / sum(n)) %>% # share within the file (still grouped)
  ungroup() %>%
  mutate(searchterm = str_extract(word, "^kirik|^kõrts")) %>%
  # Alternative: filter(searchterm == "kirik" | searchterm == "kõrts")
  filter(!is.na(searchterm)) %>% # keep only rows where searchterm was found
  group_by(filename, searchterm) %>%
  # Sum the inflected forms per file and search term; drop the grouping
  # explicitly instead of relying on summarise()'s default (and its message).
  summarise(freq = sum(freq), .groups = "drop") %>%
  ggplot(aes(y = filename, x = freq, color = searchterm)) +
  geom_point(alpha = 0.5)
```
```
# Inspect which words still contain the letter "w", counted per file.
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  filter(str_detect(word, "w"))

# Top ten tf-idf keywords of one file, after replacing "w" with "v",
# removing stop words and dropping words that contain an upper-case letter.
# TRUE/FALSE are written out because `T`/`F` can be reassigned.
keywords2 <- texts %>%
  unnest_tokens(word, txt, to_lower = FALSE) %>%
  mutate(word = str_replace_all(word, "w", "v")) %>%
  anti_join(stopwords, "word") %>%
  group_by(filename) %>%
  filter(!str_detect(word, "[A-ZÕÄÖÜ]")) %>%
  count(word, sort = TRUE) %>%
  # tf-idf is computed over all files before narrowing to a single one.
  bind_tf_idf(word, filename, n) %>%
  arrange(desc(tf_idf)) %>%
  filter(filename == "Juhan_Liiv_Noia_tutar.utf8") %>%
  head(10)
keywords2
```
```
# Top ten tf-idf keywords of the file(s) whose name contains "Vambola".
# TRUE/FALSE are written out because `T`/`F` can be reassigned.
keywords <- texts %>%
  unnest_tokens(word, txt, to_lower = FALSE) %>%
  mutate(word = str_replace_all(word, "w", "v")) %>%
  # Stop-word removal is commented out in this variant:
  # anti_join(stopwords, "word") %>%
  group_by(filename) %>%
  filter(!str_detect(word, "[A-ZÕÄÖÜ]")) %>%
  count(word, sort = TRUE) %>%
  # tf-idf is computed over all files before narrowing to the matching one(s).
  bind_tf_idf(word, filename, n) %>%
  filter(str_detect(filename, "Vambola")) %>%
  arrange(desc(tf_idf)) %>%
  head(10)
```
```
filename word n tf idf tf_idf
<chr> <chr> <int> <dbl> <dbl> <dbl>
1 Andres_Saal_Vambola.utf8 sõjamees 81 0.00120 1.61 0.00193
2 Andres_Saal_Vambola.utf8 vanake 64 0.000948 1.61 0.00153
3 Andres_Saal_Vambola.utf8 vaenlased 63 0.000933 1.46 0.00136
4 Andres_Saal_Vambola.utf8 neiuke 25 0.000370 3.40 0.00126
5 Andres_Saal_Vambola.utf8 vanema 80 0.00119 1.00 0.00119
6 Andres_Saal_Vambola.utf8 vaenlaste 65 0.000963 1.20 0.00116
7 Andres_Saal_Vambola.utf8 kalamees 32 0.000474 2.30 0.00109
8 Andres_Saal_Vambola.utf8 malev 21 0.000311 3.40 0.00106
9 Andres_Saal_Vambola.utf8 sõbranna 29 0.000430 2.30 0.000989
10 Andres_Saal_Vambola.utf8 sõjamehe 33 0.000489 2.01 0.000985
```