Currently we have an average of 18000/20563 of messages withouth commodity detected when comparing channels with and withouth score.
This issue is one of the impactfull data points for scoring channels.
We detected some patterns that may give us a clue of detection improvements.
We use this query to filter relevant cases:
```
SELECT commodity, channel_crowd_score, entry_price_range, target_prices, message_text
FROM public.cloudburst_signals
WHERE signal_type = 'buy'
AND LENGTH(message_text) > 35
AND source_datetime < current_date - 20
AND commodity IS NULL
AND channel_crowd_score IS NULL
AND lower(message_text) NOT LIKE '%minutes%'
AND lower(message_text) NOT LIKE '%hour%'
AND lower(message_text) NOT LIKE '%hours%'
AND lower(message_text) NOT LIKE '%mins%'
AND lower(message_text) NOT LIKE '%challenge%'
ORDER BY message_text ASC -- And with DESC And withouth ORDER BY
LIMIT 1000
```
# for commodities
### The use of # before the initials of the cripto.

#### potential regex
```
# Select the rows with empty commodity list
rows_to_update = signals['commodity'].apply(lambda x: x == [])
# Extract information between space and BTC (excluding the BTC string)
commodity_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [re.findall(r'#(\S+?)\s', x)][0] if re.findall(r'#(\S+?)\s', x) else [])
# update the commodity column with the new values
signals.loc[rows_to_update, 'commodity'] = commodity_values
print(len(signals[signals['commodity'].apply(lambda x: len(x) > 0)]))
```
### The use of $ before the initials of the cripto.

#### potential regex
```
# Select the rows with empty commodity list
rows_to_update = signals['commodity'].apply(lambda x: x == [])
# Extract information between space and BTC (excluding the BTC string)
commodity_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [re.findall(r'\$(\S+)', x)][0] if re.findall(r'\$(\S+)', x) else [])
# update the commodity column with the new values
signals.loc[rows_to_update, 'commodity'] = commodity_values
print(len(signals[signals['commodity'].apply(lambda x: len(x) > 0)]))
```
### The use of # and a number before the initials of the cripto

#### potential regex
```
# Select the rows with empty commodity list
rows_to_update = signals['commodity'].apply(lambda x: x == [])
# Extract information between space and BTC (excluding the BTC string)
commodity_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [re.findall(r'[^#]*#([^#\s]+)', x)][0] if re.findall(r'[^#]*#([^#\s]+)', x) else [])
# update the commodity column with the new values
signals.loc[rows_to_update, 'commodity'] = commodity_values
print(len(signals[signals['commodity'].apply(lambda x: len(x) > 0)]))
```
### The use of secuencial coin initial with the pair (This may happend post a # too)




#### potential regex
Usually are all paired to USDT or BTC so:
```
### USDT ###
# Extract information between space and USDT (excluding the USDT string)
signals['commodity'] = signals['message_text'].apply(lambda x: re.findall(r' (\S+)USDT', x))
# eliminate # and / from the extracted information
signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'[/#]', '', i) for i in x] if len(x) == 0 else x)
# print the number of usdt_info that are not empty
print(len(signals[signals['commodity'].apply(lambda x: len(x) > 0)]))
### BTC ###
# Select the rows with empty commodity list
rows_to_update = signals['commodity'].apply(lambda x: x == [])
# Extract information between space and BTC (excluding the BTC string)
commodity_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [re.findall(r' (\S+)BTC', x)][0] if re.findall(r' (\S+)BTC', x) else [])
# update the commodity column with the new values
signals.loc[rows_to_update, 'commodity'] = commodity_values
print(len(signals[signals['commodity'].apply(lambda x: len(x) > 0)]))
```
### General cleanup after
```
# eliminate from columns commodity all the #
signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'#', '', i) for i in x] if len(x) == 0 else x)
# eliminate from columns commodity all the letter that are not in the alphabet
signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'[^a-zA-Z]', '', i) for i in x] if len(x) == 0 else x)
# eliminate from columns commodity all the letter that are not in capital letters
signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'[^A-Z]', '', i) for i in x] if len(x) == 0 else x)
# eliminate from columns commodity all the characters after a /
signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'/.*', '', i) for i in x] if len(x) == 0 else x)
# eliminate from columns commodity all the characters after a ,
signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r',.*', '', i) for i in x] if len(x) == 0 else x)
# eliminate from columns commodity all the characters after a _ or -
signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'[_-].*', '', i) for i in x] if len(x) == 0 else x)
# eliminate from columns commodity all the characters after a (
signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'\(.*', '', i) for i in x] if len(x) == 0 else x)
# eliminate from columns commodity all the characters before a )
signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'.*\)', '', i) for i in x] if len(x) == 0 else x)
# eliminate all the string that have more than 5 characters
signals['commodity'] = signals['commodity'].apply(lambda x: [s for s in x if len(s) <= 5])
```
# for target price and entry prices
```
# Extract the buy zone value from the message_text column
buy_zone_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [str(re.findall(r'buy\s*zone\s*(\d+)', x, re.IGNORECASE)[0])] if re.findall(r'buy\s*zone\s*(\d+)', x, re.IGNORECASE) else [])
# Update the buy_zone column with the new values
signals.loc[rows_to_update, 'entry_price_range'] = buy_zone_values
# target_prices
# Select the rows with empty sell_zone column
rows_to_update = signals['target_prices'].apply(lambda x: x == [])
# Extract the sell zone values from the message_text column
sell_zone_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [int(value) for value in re.findall(r'sell\s*zone\s*(\d+(?:-\d+)*)', x, re.IGNORECASE)[0].split('-')] if re.findall(r'sell\s*zone\s*(\d+(?:-\d+)*)', x, re.IGNORECASE) else [])
# Update the sell_zone column with the new values
signals.loc[rows_to_update, 'target_prices'] = sell_zone_values
### entry prices and target price 2
# Select the rows with empty entry_price_range column
rows_to_update = signals['entry_price_range'].apply(lambda x: x == [])
# Extract the entry price range value from the message_text column
entry_price_range_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [int(re.findall(r'Buying Zone (\d+)-\d+', x)[0])] if re.findall(r'Buying Zone (\d+)-\d+', x) else [])
# Update the entry_price_range column with the new values
signals.loc[rows_to_update, 'entry_price_range'] = entry_price_range_values
# Select the rows with empty target_prices column
rows_to_update = signals['target_prices'].apply(lambda x: x == [])
# Extract the target prices from the message_text column
target_prices_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x:
[value for match in re.finditer(r'Sell\s*(?:🤑|🚀)?\s*([\d-]+)', x, re.IGNORECASE)
for value in match.group(1).split('-')]
if re.findall(r'Sell\s*(?:🤑|🚀)?\s*([\d-]+)', x, re.IGNORECASE) else []
)
# Update the target_prices column with the new values
signals.loc[rows_to_update, 'target_prices'] = target_prices_values
# Select the rows with empty entry_price_range column
rows_to_update = signals['entry_price_range'].apply(lambda x: x == [])
# Extract the entry price range values from the message_text column
entry_price_range_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [int(num) for num in re.findall(r'Entry Zone (\d+)-(\d+)', x)[0]] if re.findall(r'entry Zone (\d+)-(\d+)', x) else [])
# Update the entry_price_range column with the new values
signals.loc[rows_to_update, 'entry_price_range'] = entry_price_range_values
```