---
tags: tekstiaine
---
# Tunni jooksul tehtud märkmed (SVUH.00.093 tekstid2)
Koondan siia tunni jooksul tehtud märkmed, et neid oleks lihtsam kopeerida.
```
Siia saab koodi kirjutada
```
```
# Word frequency table: one row per distinct word with its count `n`
# (largest count first) and `sagedus`, the count divided by the sum of counts.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
tekst_sonad <- raamat1_sonad %>%
  count(word, sort = TRUE) %>%
  mutate(sagedus = n / sum(n))
```
```
# Relative position of every row: the row index divided by (row count + 1),
# which keeps the value strictly between 0 and 1.
asukohad <- raamat1_sonad %>%
  mutate(
    nr = row_number(),
    n = n(),
    asukoht = nr / (n + 1)
  ) %>%
  # Drops any grouping that raamat1_sonad may carry.
  ungroup()
```
```
# str_detect() answers "does the pattern occur?" with TRUE/FALSE.
str_detect("meie oleme siin", "me")
str_detect("meie oleme siin", "te")
# str_extract() returns the first match, or NA when there is none.
str_extract("meie oleme siin", "me")
str_extract("meie oleme siin", "te")
# str_extract_all() returns every match, wrapped in a list.
str_extract_all("meie oleme siin", "me")
```
```
# "^" anchors the pattern to the start of the string.
str_extract("meie oleme siin", "^me")
# "\\b" is a word boundary: here "me" must start a word ...
str_extract("meie oleme siin", "\\bme")
# ... and here "me" must end a word.
str_extract("meie oleme siin", "me\\b")
# "mei" is never followed by a word boundary in this string, so this gives NA.
str_extract("meie oleme siin", "mei\\b")
```
## Näidispildid

## Isetehtud pildid.
Kleebi pilt siia Ctrl+V abil, siis laaditakse kopeeritud pilt üles ja seda näevad ka teised.






## Rahade kohta
Kolmesõnaline otsing
```
# Three-word search: for every row of raamat1, collect all matches of
# "<word> <word> rubla" found in `txt` into the list-column `leiud`.
rahad <- raamat1 %>%
  mutate(
    leiud = str_extract_all(txt, "[a-zõäüöA-ZÕÄÖÜ]+ [a-zõäüöA-ZÕÄÖÜ]+ rubla")
  )

# str_extract_all() leaves a list in every cell of the table. Keep only the
# matches column and flatten it with unnest() so each match gets its own row.
# The column is named explicitly: calling unnest() without naming the column
# is deprecated since tidyr 1.0.0.
rahad %>%
  select(leiud) %>%
  unnest(leiud)
```
Sama otsing bigrammidega
```
# Keep the bigrams that end in "rubla" ("$" anchors the match to the end).
filter(bigrammid, str_detect(bigram, "rubla$"))
```
Viisgrammide kuvamine
```
# Five-grams that occur more than three times, most frequent first.
# NOTE: the plot pipeline below does not use this table; it recomputes the
# per-five-gram count itself with group_by() + n().
top5grams <- viisgrammid %>%
  count(fivegram, sort = TRUE) %>%
  filter(n > 3)

# Dot plot showing where the repeated five-grams occur.
# `asukoht` is the row index divided by (row count + 1), i.e. a relative
# position strictly between 0 and 1. The helper column is called `nr` rather
# than `row_number` so that it does not shadow the function of the same name.
viisgrammid %>%
  mutate(nr = row_number(), n = n()) %>%
  mutate(asukoht = nr / (n + 1)) %>%
  group_by(fivegram) %>%
  mutate(mitu = n()) %>%  # occurrences of this five-gram
  filter(mitu > 3) %>%
  ggplot(aes(x = asukoht, y = fivegram)) +
  geom_point()
```
## Stopsõnad
DataDOI link: https://datadoi.ee/handle/33/78

Suurtähtede kõrvalejätmiseks täisloendist.
```
# Label plot of the top-ranked words in chapters 1-10, leaving out stop words
# and every word that contains an upper-case letter.
raamat1_sonad2 %>%
  group_by(chapter) %>%
  count(word, sort = TRUE) %>%
  anti_join(stopwords, by = "word") %>%
  filter(!str_detect(word, "[A-ZÕÄÖÜ]")) %>%
  # count() keeps the chapter grouping, so rows are numbered per chapter in
  # the order left by the sorted count.
  mutate(row_number = row_number()) %>%
  filter(row_number < 11, chapter < 11) %>%
  ggplot(aes(x = chapter, y = row_number, label = word)) +
  geom_label()
```
## tf-idf

Peatükid 1-10

Peatükid 11-20

## Mitu teksti korraga
```
# Row count per file, limited to files whose name contains "Tammsaare".
texts %>%
  filter(filename %>% str_detect("Tammsaare")) %>%
  count(filename)
```
```
# Dot plot of the number of rows in `texts` for every file.
ggplot(count(texts, filename), aes(x = n, y = filename)) +
  geom_point()
```
```
# Number of rows in `words` for every file.
count(words, filename)
```
```
# Overall word counts across all files, largest count first.
words %>%
  count(word, sort = TRUE)

# Bar chart of the first ten rows of that sorted count. The rows are picked
# with filter(row_number() < 11), the same form the per-file snippets use,
# so no helper column named after the row_number() function is needed.
words %>%
  count(word, sort = TRUE) %>%
  filter(row_number() < 11) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col()

# Word counts computed separately for every file.
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE)
```

```
# First ten rows of the sorted per-file word count, for the files whose name
# contains "Ennosaare".
words %>%
  # Alternative placement of the file filter, left disabled:
  # filter(str_detect(filename, "Ennosaare")) %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  filter(str_detect(filename, "Ennosaare")) %>%
  # The filename grouping is still active, so this keeps ten rows per file.
  filter(row_number() < 11)
```
Ja sama igast tekstist eraldi:
```
# First ten rows of the sorted word count for every file.
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  # The filename grouping is still active, so this keeps ten rows per file.
  filter(row_number() < 11)
```
## Veel graafikuid.
```
# Relative frequency of the word "hobune" in every file, one row per file on
# the y axis. `freq` is the word's count divided by the file's total count.
# The colour legend is hidden with guides(color = "none"); passing FALSE (or
# F) to guides() is deprecated since ggplot2 3.3.4.
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  mutate(freq = n / sum(n)) %>%
  filter(word == "hobune") %>%  # sort already put these in the right order
  ggplot(aes(y = filename, x = freq, color = filename)) +
  geom_point() +
  guides(color = "none")

# The same data with the word on the y axis, so all files share one row and
# are told apart only by colour.
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  mutate(freq = n / sum(n)) %>%
  filter(word == "hobune") %>%  # sort already put these in the right order
  ggplot(aes(y = word, x = freq, color = filename)) +
  geom_point() +
  guides(color = "none")
```




```
# Summed relative frequency, per file, of all words that start with "töö".
words %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  mutate(freq = n / sum(n)) %>%
  filter(str_detect(word, "^töö")) %>%
  # The data is still grouped by filename, so this gives one total per file.
  summarise(freq = sum(freq)) %>%
  ggplot(aes(y = filename, x = freq)) +
  geom_point()
```






## Eristavad sõnad
```
# Distinctive words per file, ranked by tf-idf (highest first).
# Tokens keep their original case (to_lower = FALSE) so that words containing
# an upper-case letter can be filtered out; every "w" is replaced with "v"
# before counting.
keywords <- texts %>%
  unnest_tokens(word, txt, to_lower = FALSE) %>%
  mutate(word = str_replace_all(word, "w", "v")) %>%
  # Stop-word removal, left disabled:
  # anti_join(stopwords, "word") %>%
  group_by(filename) %>%
  filter(!str_detect(word, "[A-ZÕÄÖÜ]")) %>%
  count(word, sort = TRUE) %>%
  bind_tf_idf(word, filename, n) %>%
  arrange(desc(tf_idf))
# Optional truncation, left disabled:
# keywords <- keywords %>% head(100)

# Rows for the files whose name contains "vabrik".
keywords %>%
  filter(str_detect(filename, "vabrik"))
```
```
# Same tf-idf ranking as a single pipeline: the first 100 rows for the files
# whose name contains "vabrik".
# NOTE: this assigns to `keywords` again, replacing any earlier value.
keywords <- texts %>%
  unnest_tokens(word, txt, to_lower = FALSE) %>%
  mutate(word = str_replace_all(word, "w", "v")) %>%
  # Stop-word removal, left disabled:
  # anti_join(stopwords, "word") %>%
  group_by(filename) %>%
  filter(!str_detect(word, "[A-ZÕÄÖÜ]")) %>%
  count(word, sort = TRUE) %>%
  bind_tf_idf(word, filename, n) %>%
  arrange(desc(tf_idf)) %>%
  filter(str_detect(filename, "vabrik")) %>%
  head(100)
```
```
# Per file, the summed relative frequency of words starting with "mets",
# "maa" or "linn".
tabel <- words %>%
  group_by(filename) %>%
  count(word, sort = TRUE) %>%
  mutate(freq = n / sum(n)) %>%
  # searchterm holds the matched prefix, or NA when the word matches none.
  mutate(searchterm = str_extract(word, "^mets|^maa|^linn")) %>%
  filter(!is.na(searchterm)) %>%  # keep only rows that have a searchterm
  group_by(filename, searchterm) %>%
  summarise(freq = sum(freq)) %>%
  # summarise() drops only the last grouping level; remove the rest so the
  # saved table is not left grouped by filename.
  ungroup()

# Write the table once. The piped form
#   tabel %>% write_tsv("output/mets_maa_linn.tsv")
# is equivalent, so running both only rewrites the same file.
write_tsv(tabel, "output/mets_maa_linn.tsv")
```