# Solutions
## Exercise 1
### Tom
```python
import numpy as np


def weighted_cosine(a: np.ndarray, b: np.ndarray, weights: np.ndarray = None):
    """Compute the cosine of the angle between vectors a and b, with their
    components scaled by the given weights."""
    if weights is None:
        weights = np.ones(len(a))
    a = a * weights
    b = b * weights
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


vectors = np.array([
    [3.06, 500, 6],
    [2.68, 320, 4],
    [2.92, 640, 6],
])

for part, weights in zip(
    ["a", "b", "c"],
    [np.array([1, 1, 1]),
     np.array([1, 0.01, 0.5]),
     # part c: weight each component by the inverse of its column average,
     # so every component contributes on a comparable scale
     np.array([1 / np.average(vectors[:, i]) for i in range(vectors.shape[1])])]
):
    print(f"=== Part {part} ===")
    print(f"Weights: {weights}")
    for i in range(len(vectors)):
        for j in range(i + 1, len(vectors)):
            angle = weighted_cosine(vectors[i], vectors[j], weights)
            print(f"Angle between vectors {i} and {j}: {angle}")
    print()
```
## Exercise 2
- valuable data: other people's preferences
- collaborative-based:
  1. like a random item
  2. keep liking the suggested items until the suggestions run out (if the system is built poorly)
  3. you now have something that may be the profile of one user, but is more likely an average of several users' profiles (see the sketch below)
  - use this to populate your own website if you're just starting out
  - even if you don't have a website, it gives you the areas of user interest
- content-based:
  - you could do the same thing to create user profiles
  - it's not going to tell you what the users on the website like
  - you could try to guess the feature vectors and check whether the items you recommend are similar to those of the other system
- impact:
  - collaborative-based: yes, since we're creating a bunch of fake users
  - content-based: not so much, if it's purely content-based
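A minimal sketch of the probing loop described under collaborative-based above. `FakeClient` and its `like`/`get_suggestions` methods are hypothetical stand-ins for whatever interface the target site actually exposes:

```python
class FakeClient:
    """Hypothetical stand-in for the target site: suggests items that
    were co-liked with the items we already liked."""

    def __init__(self, co_liked: dict):
        self.co_liked = co_liked  # item -> items other users liked alongside it
        self.liked = []

    def like(self, item):
        self.liked.append(item)

    def get_suggestions(self):
        return [s for item in self.liked for s in self.co_liked.get(item, [])]


def probe_profile(client, start_item, max_likes=100):
    """Like items until the suggestions run out; the collected set
    approximates an (averaged) user profile of the target system."""
    liked = {start_item}
    client.like(start_item)
    while len(liked) < max_likes:
        suggestions = [s for s in client.get_suggestions() if s not in liked]
        if not suggestions:  # a poorly built system eventually runs out
            break
        liked.add(suggestions[0])
        client.like(suggestions[0])
    return liked


print(probe_profile(FakeClient({"A": ["B", "C"], "B": ["C", "D"]}), "A"))
```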
## Exercise 3
### Tom
```python
import numpy as np


def cosine_distance(a: np.ndarray, b: np.ndarray):
    """Compute the cosine of the angle between vectors a and b."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def jaccard_similarity(a: np.ndarray, b: np.ndarray):
    """Compute the Jaccard similarity of two vectors, treated as sets of
    their non-zero positions."""
    a_set = {i for i, v in enumerate(a) if v != 0}
    b_set = {i for i, v in enumerate(b) if v != 0}
    return len(a_set & b_set) / len(a_set | b_set)


# rows are users, columns their ratings of items (0 = not rated)
table = np.array([
    [4, 5, 0, 5, 1, 0, 3, 2],
    [0, 3, 4, 3, 1, 2, 1, 0],
    [2, 0, 1, 3, 0, 4, 5, 3],
])

print("=== Part a) ===")
for i in range(len(table)):
    for j in range(i + 1, len(table)):
        a = table[i]
        b = table[j]
        print(f"Jaccard similarity of {a} and {b}: {jaccard_similarity(a, b)}")
print()

print("=== Part b) ===")
for i in range(len(table)):
    for j in range(i + 1, len(table)):
        a = table[i]
        b = table[j]
        print(f"Cosine distance of {a} and {b}: {cosine_distance(a, b)}")
print()

print("=== Part c) ===")
# treat ratings of 3 and above as 1, everything else (including unrated) as 0
for i in range(len(table)):
    for j in range(i + 1, len(table)):
        a = np.array([0 if v <= 2 else 1 for v in table[i]])
        b = np.array([0 if v <= 2 else 1 for v in table[j]])
        print(f"Jaccard similarity of {a} and {b}: {jaccard_similarity(a, b)}")
print()

print("=== Part d) ===")
for i in range(len(table)):
    for j in range(i + 1, len(table)):
        a = np.array([0 if v <= 2 else 1 for v in table[i]])
        b = np.array([0 if v <= 2 else 1 for v in table[j]])
        print(f"Cosine distance of {a} and {b}: {cosine_distance(a, b)}")
print()

print("=== Part e) ===")
# center each user's ratings by subtracting that user's average rating
normalized_table = np.array([row - np.average(row) for row in table])
print(normalized_table)
print()

print("=== Part f) ===")
# cosine on the mean-centered rows is the Pearson correlation of the ratings
for i in range(len(table)):
    for j in range(i + 1, len(table)):
        a = normalized_table[i]
        b = normalized_table[j]
        print(f"Cosine distance of {a} and {b}: {cosine_distance(a, b)}")
print()
```
## Exercise 4
a) two utility matrices (sketched below):
- **students x professors**
  - value is an integer from 1 (worst) to 5 (best)
- **students x events**
  - value is a string commenting on the event, together with a would/wouldn't-recommend boolean
- (not-)recommend an event/professor based on (dis-)liking the other
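A minimal sketch of how the two matrices from a) could be represented; the students, professors and events are made up for illustration:

```python
import numpy as np

# students x professors: ratings from 1 (worst) to 5 (best), 0 = no rating yet
professor_ratings = np.array([
    [5, 0, 3],  # student 0
    [0, 2, 4],  # student 1
])

# students x events: a free-text comment plus a would/wouldn't-recommend flag,
# kept sparse as a dict keyed by (student, event)
event_feedback = {
    (0, "hackathon"): ("Great mentors!", True),
    (1, "job fair"): ("Too crowded.", False),
}
```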
b) artists x artworks
- **value is an integer from 1 (worst) to 5 (best)**
- artists: number of views of their portfolio
  - recommend artists depending on whether you like more or less popular ones
- artworks: category (portrait/landscape/etc.)
  - recommend artworks based on the categories you liked
c) users x users
- **boolean matrix (like/don't like)**
- the profile contains information such as
  - age, gender, height
  - interests
  - location, language
- show people similar to those the user messaged / hide people similar to those they blocked (see the sketch below)
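A minimal sketch of the last point; the interest-set similarity is just one possible profile-based measure, and the helper names are made up:

```python
def interest_similarity(a: set, b: set) -> float:
    """Jaccard similarity of two users' interest sets."""
    return len(a & b) / len(a | b) if a | b else 0.0


def candidate_score(candidate: set, messaged: list, blocked: list) -> float:
    """Rank a candidate profile: similarity to people the user messaged
    counts for it, similarity to people the user blocked counts against it."""
    plus = max((interest_similarity(candidate, m) for m in messaged), default=0.0)
    minus = max((interest_similarity(candidate, b) for b in blocked), default=0.0)
    return plus - minus


print(candidate_score({"hiking", "jazz"},
                      messaged=[{"hiking", "cooking"}],
                      blocked=[{"cars"}]))
```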
## Exercise 5
### Tom
```python
from pyspark.context import SparkContext
from pyspark.mllib.linalg.distributed import CoordinateMatrix, IndexedRow, IndexedRowMatrix
from pyspark.mllib.linalg import SparseVector
from pyspark.sql import SparkSession
from typing import List

import numpy as np

sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()


def cosine_distance(a: np.ndarray, b: np.ndarray):
    """Compute the cosine of the angle between vectors a and b."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def get_vector(matrix: IndexedRowMatrix, i):
    """Get the row with the given index from an IndexedRowMatrix."""
    return matrix.rows.filter(lambda x: x.index == i).collect()[0].vector


def _pearson_correlation(a: SparseVector, b: SparseVector):
    """Return the Pearson correlation of two sparse vectors."""
    # use only the columns that are non-zero in both vectors
    non_zero_columns = np.intersect1d(a.indices, b.indices)
    # if they share no components, treat them as independent
    if len(non_zero_columns) == 0:
        return 0
    non_zero_vectors = np.array([
        [a[int(i)] for i in non_zero_columns],
        [b[int(i)] for i in non_zero_columns],
    ])
    # center the vectors by their averages; the cosine of the centered
    # vectors is exactly the Pearson correlation
    centered_vectors = np.array([row - np.average(row) for row in non_zero_vectors])
    return cosine_distance(*centered_vectors)


def pearson_correlation(matrix: IndexedRowMatrix, a: int, b: int):
    """Return the correlation of the two rows with the given indexes."""
    return _pearson_correlation(get_vector(matrix, a), get_vector(matrix, b))


def k_nearest_neighbours(matrix: IndexedRowMatrix, a: int, k: int) -> List[int]:
    """Return the indexes of the k nearest neighbours of the row a."""
    a = get_vector(matrix, a)
    return matrix.rows\
        .sortBy(lambda x: _pearson_correlation(a, x.vector), ascending=False)\
        .map(lambda x: x.index)\
        .take(k + 1)[1:]  # don't include the row itself


print("=== Part a) ===")
artist_aliases = sc.textFile("data/artist_alias_small.txt")\
    .map(lambda x: list(map(int, x.split("\t"))))\
    .collectAsMap()

# users with bad artist IDs fixed;
# the groupBy and mapping sum up the listens of artists merged by the fix
users = sc.textFile("data/user_artist_data_small.txt")\
    .map(lambda x: list(map(int, x.split())))\
    .map(lambda x: (x[0],
                    x[1] if x[1] not in artist_aliases else artist_aliases[x[1]],
                    x[2]))\
    .groupBy(lambda x: (x[0], x[1]))\
    .map(lambda x: list(x[1]))\
    .map(lambda x: (x[0][0], x[0][1], sum([y[2] for y in x])))

# the artist names themselves contain spaces, so use maxsplit
# to split only on the first whitespace (after the artist ID)
artists_dictionary = sc.textFile("data/artist_data_small.txt")\
    .map(lambda x: x.split(maxsplit=1))\
    .map(lambda x: (x[1], int(x[0])))\
    .collectAsMap()

# store the users as an indexed row matrix to not waste memory
utility_matrix = CoordinateMatrix(users)\
    .toIndexedRowMatrix()

print("Utility matrix populated.")
print()

print("=== Part b) ===")
print("See pearson_correlation(matrix, a, b)")
i, j = 1059637, 2064012
print(f"Correlation of {i} and {j}: {pearson_correlation(utility_matrix, i, j)}")
print()

print("=== Part c) ===")
print("See k_nearest_neighbours(matrix, a, k)")
i, k = 1059637, 5
print(f"{k} nearest neighbours of {i}: {k_nearest_neighbours(utility_matrix, i, k)}")
print()

print("=== Part d) ===")
# the new row must be as wide as the utility matrix (its size is the total
# number of columns, not the number of non-zero entries)
U = IndexedRow(123456789, SparseVector(
    utility_matrix.numCols(),
    [
        (artists_dictionary["Metallica"], 10.0),
        (artists_dictionary["Pink Floyd"], 4.0),
        (artists_dictionary["Black Sabbath"], 2.0),
        (artists_dictionary["Slayer"], 3.0),
        (artists_dictionary["Bon Jovi"], 1.0),
    ])
)

print(f"User count: {utility_matrix.rows.count()}")

# a little ugly, but there isn't a straightforward way to add a new row to the RDD
utility_matrix = IndexedRowMatrix(utility_matrix.rows.union(sc.parallelize([U])))

print(f"User count after adding U: {utility_matrix.rows.count()}")
print()
```