for commodities

Currently we have an average of 18000/20563 of messages withouth commodity detected when comparing channels with and withouth score. This issue is one of the impactfull data points for scoring channels. We detected some patterns that may give us a clue of detection improvements. We use this query to filter relevant cases: ``` SELECT commodity, channel_crowd_score, entry_price_range, target_prices, message_text FROM public.cloudburst_signals WHERE signal_type = 'buy' AND LENGTH(message_text) > 35 AND source_datetime < current_date - 20 AND commodity IS NULL AND channel_crowd_score IS NULL AND lower(message_text) NOT LIKE '%minutes%' AND lower(message_text) NOT LIKE '%hour%' AND lower(message_text) NOT LIKE '%hours%' AND lower(message_text) NOT LIKE '%mins%' AND lower(message_text) NOT LIKE '%challenge%' ORDER BY message_text ASC -- And with DESC And withouth ORDER BY LIMIT 1000 ``` # for commodities ### The use of # before the initials of the cripto. ![](https://i.imgur.com/v8Qkmjv.png) #### potential regex ``` # Select the rows with empty commodity list rows_to_update = signals['commodity'].apply(lambda x: x == []) # Extract information between space and BTC (excluding the BTC string) commodity_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [re.findall(r'#(\S+?)\s', x)][0] if re.findall(r'#(\S+?)\s', x) else []) # update the commodity column with the new values signals.loc[rows_to_update, 'commodity'] = commodity_values print(len(signals[signals['commodity'].apply(lambda x: len(x) > 0)])) ``` ### The use of $ before the initials of the cripto. ![](https://i.imgur.com/Gv21WY0.png) #### potential regex ``` # Select the rows with empty commodity list rows_to_update = signals['commodity'].apply(lambda x: x == []) # Extract information between space and BTC (excluding the BTC string) commodity_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [re.findall(r'\$(\S+)', x)][0] if re.findall(r'\$(\S+)', x) else []) # update the commodity column with the new values signals.loc[rows_to_update, 'commodity'] = commodity_values print(len(signals[signals['commodity'].apply(lambda x: len(x) > 0)])) ``` ### The use of # and a number before the initials of the cripto ![](https://i.imgur.com/jE6CEa7.png) #### potential regex ``` # Select the rows with empty commodity list rows_to_update = signals['commodity'].apply(lambda x: x == []) # Extract information between space and BTC (excluding the BTC string) commodity_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [re.findall(r'[^#]*#([^#\s]+)', x)][0] if re.findall(r'[^#]*#([^#\s]+)', x) else []) # update the commodity column with the new values signals.loc[rows_to_update, 'commodity'] = commodity_values print(len(signals[signals['commodity'].apply(lambda x: len(x) > 0)])) ``` ### The use of secuencial coin initial with the pair (This may happend post a # too) ![](https://i.imgur.com/bliZmlR.png) ![](https://i.imgur.com/toMegSX.png) ![](https://i.imgur.com/2l2QX1g.png) ![](https://i.imgur.com/4X3ZyFy.png) #### potential regex Usually are all paired to USDT or BTC so: ``` ### USDT ### # Extract information between space and USDT (excluding the USDT string) signals['commodity'] = signals['message_text'].apply(lambda x: re.findall(r' (\S+)USDT', x)) # eliminate # and / from the extracted information signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'[/#]', '', i) for i in x] if len(x) == 0 else x) # print the number of usdt_info that are not empty print(len(signals[signals['commodity'].apply(lambda x: len(x) > 0)])) ### BTC ### # Select the rows with empty commodity list rows_to_update = signals['commodity'].apply(lambda x: x == []) # Extract information between space and BTC (excluding the BTC string) commodity_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [re.findall(r' (\S+)BTC', x)][0] if re.findall(r' (\S+)BTC', x) else []) # update the commodity column with the new values signals.loc[rows_to_update, 'commodity'] = commodity_values print(len(signals[signals['commodity'].apply(lambda x: len(x) > 0)])) ``` ### General cleanup after ``` # eliminate from columns commodity all the # signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'#', '', i) for i in x] if len(x) == 0 else x) # eliminate from columns commodity all the letter that are not in the alphabet signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'[^a-zA-Z]', '', i) for i in x] if len(x) == 0 else x) # eliminate from columns commodity all the letter that are not in capital letters signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'[^A-Z]', '', i) for i in x] if len(x) == 0 else x) # eliminate from columns commodity all the characters after a / signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'/.*', '', i) for i in x] if len(x) == 0 else x) # eliminate from columns commodity all the characters after a , signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r',.*', '', i) for i in x] if len(x) == 0 else x) # eliminate from columns commodity all the characters after a _ or - signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'[_-].*', '', i) for i in x] if len(x) == 0 else x) # eliminate from columns commodity all the characters after a ( signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'$.*', '', i) for i in x] if len(x) == 0 else x) # eliminate from columns commodity all the characters before a ) signals['commodity'] = signals['commodity'].apply(lambda x: [re.sub(r'.*$', '', i) for i in x] if len(x) == 0 else x) # eliminate all the string that have more than 5 characters signals['commodity'] = signals['commodity'].apply(lambda x: [s for s in x if len(s) <= 5]) ``` # for target price and entry prices ``` # Extract the buy zone value from the message_text column buy_zone_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [str(re.findall(r'buy\s*zone\s*(\d+)', x, re.IGNORECASE)[0])] if re.findall(r'buy\s*zone\s*(\d+)', x, re.IGNORECASE) else []) # Update the buy_zone column with the new values signals.loc[rows_to_update, 'entry_price_range'] = buy_zone_values # target_prices # Select the rows with empty sell_zone column rows_to_update = signals['target_prices'].apply(lambda x: x == []) # Extract the sell zone values from the message_text column sell_zone_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [int(value) for value in re.findall(r'sell\s*zone\s*(\d+(?:-\d+)*)', x, re.IGNORECASE)[0].split('-')] if re.findall(r'sell\s*zone\s*(\d+(?:-\d+)*)', x, re.IGNORECASE) else []) # Update the sell_zone column with the new values signals.loc[rows_to_update, 'target_prices'] = sell_zone_values ### entry prices and target price 2 # Select the rows with empty entry_price_range column rows_to_update = signals['entry_price_range'].apply(lambda x: x == []) # Extract the entry price range value from the message_text column entry_price_range_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [int(re.findall(r'Buying Zone (\d+)-\d+', x)[0])] if re.findall(r'Buying Zone (\d+)-\d+', x) else []) # Update the entry_price_range column with the new values signals.loc[rows_to_update, 'entry_price_range'] = entry_price_range_values # Select the rows with empty target_prices column rows_to_update = signals['target_prices'].apply(lambda x: x == []) # Extract the target prices from the message_text column target_prices_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [value for match in re.finditer(r'Sell\s*(?:🤑|🚀)?\s*([\d-]+)', x, re.IGNORECASE) for value in match.group(1).split('-')] if re.findall(r'Sell\s*(?:🤑|🚀)?\s*([\d-]+)', x, re.IGNORECASE) else [] ) # Update the target_prices column with the new values signals.loc[rows_to_update, 'target_prices'] = target_prices_values # Select the rows with empty entry_price_range column rows_to_update = signals['entry_price_range'].apply(lambda x: x == []) # Extract the entry price range values from the message_text column entry_price_range_values = signals.loc[rows_to_update, 'message_text'].apply(lambda x: [int(num) for num in re.findall(r'Entry Zone (\d+)-(\d+)', x)[0]] if re.findall(r'entry Zone (\d+)-(\d+)', x) else []) # Update the entry_price_range column with the new values signals.loc[rows_to_update, 'entry_price_range'] = entry_price_range_values ```