# HW1 Twitter Crawler with API and requests
###### tags: `HW1`

## Code
- Read from dataset
```python=
import pandas as pd
import json
from pandas import json_normalize
import numpy as np

# Load the labelled tweet ids and flatten the JSON records into a DataFrame.
with open('wtwt_ids.json', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

df = json_normalize(data)
print(df)
```
```
                 tweet_id    merger     stance
0      971761970117357568   CI_ESRX    support
1      950934259371520000   CI_ESRX  unrelated
2      973718376496357377   CI_ESRX    comment
3      996772902006599680   CI_ESRX    support
4      971712098253320193   CI_ESRX    support
...                   ...       ...        ...
51279  928683906731270144  FOXA_DIS    comment
51280  950566340926099456  FOXA_DIS  unrelated
51281  927233376427311104  FOXA_DIS  unrelated
51282  952235091010506752  FOXA_DIS    comment
51283  902139974732070912  FOXA_DIS  unrelated

[51284 rows x 3 columns]
```

- Query with API
```python=
import requests
import os
import json


def create_url(ids):
    """Return the Tweet-lookup URL for a prepared ``ids=...`` query string.

    ``ids`` must already include its ``ids=`` prefix, followed by a single
    Tweet ID or up to 100 comma-separated IDs.
    """
    # Tweet fields are adjustable.
    # Options include:
    # attachments, author_id, context_annotations,
    # conversation_id, created_at, entities, geo, id,
    # in_reply_to_user_id, lang, non_public_metrics, organic_metrics,
    # possibly_sensitive, promoted_metrics, public_metrics, referenced_tweets,
    # source, text, and withheld
    tweet_fields = "tweet.fields=lang,author_id,source"
    return f"https://api.twitter.com/2/tweets?{ids}&{tweet_fields}"


def bearer_oauth(r):
    """
    Method required by bearer token authentication.

    Adds the ``Authorization`` and ``User-Agent`` headers to the outgoing
    request ``r`` and returns it. The token is read from the
    ``BEARER_TOKEN`` environment variable so that no credential is stored
    in this document; a ``RuntimeError`` is raised when it is not set.
    """
    bearer_token = os.environ.get("BEARER_TOKEN")
    if not bearer_token:
        raise RuntimeError(
            "Set the BEARER_TOKEN environment variable before querying the API"
        )
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2TweetLookupPython"
    return r


def connect_to_endpoint(url, timeout=30):
    """Send a GET request to ``url`` and return the decoded JSON body.

    The HTTP status code is printed. Any status other than 200 raises an
    ``Exception`` carrying the status code and the response text.
    ``timeout`` (seconds) keeps the call from hanging on a stalled
    connection.
    """
    response = requests.request("GET", url, auth=bearer_oauth, timeout=timeout)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()
```

Tweets lookup with ids
```python=
# add 100 ids to query str
id_list = df[:100]['tweet_id']
ids = 'ids=' + ",".join(str(tweet_id) for tweet_id in id_list)
print(ids)
```
```
ids=971761970117357568,950934259371520000,973718376496357377,996772902006599680,971712098253320193,979498827966279680,974142318990127104,987599698021289989,993307717929390081,979222181568811010,973604160364048384,973332326087262209,989159993718005760,877474140260839426,941785332189683712,972639407630110720,971905248460771329,973921707361697792,973642738615795712,980094009086377984,971800468606345218,993579961410768896,972367011836809217,972076128801296384,975862000898199555,973947350740819968,976338532963766272,981915145961029633,971742147308220416,983139990325682177,965428549934108672,976235715158773760,974315379865587712,972610531537272832,988572478846001153,990953940782108672,972286864181755904,971993749306798080,971785264971497472,921364009311440897,975886742434865154,989098445590974464,971758153057624065,935159657966063616,971773356411244544,941887307191406592,971789276299038720,981335581912387585,971692212139409409,971800684751392770,972081179280031749,972113059073417216,973152487904628736,908828896937824256,972484861264097280,986969164223926272,974051695478099969,973122146133250048,974374689383567362,973540031494344705,917769869105524737,975981996617936896,975866390036008961,974684725083287552,971866708557283328,931552384802852865,972708502937378816,971755509345865728,97248348363231641
7,931185394896343040,953266794042556416,979706119223480320,973109223658872834,973726335460888577,974722426645905408,990954223121682440,981191844422971393,972615362146635776,946042139397513217,973083702917062656,975800342905573376,972518214465863686,984892966232829952,971796004998668288,971734300486983680,978580700533202944,988536839714820097,972133710081310720,923171852201156608,977198501238067200,971766737300545538,884744121021153280,973564406184448000,971758871210528774,974665180813709312,975559993528324102,943170920658751490,972143523922288640,982315443787624453,992093382473236480
```

Send query with 100 ids
```python=
# query with requests
url = create_url(ids)
json_response = connect_to_endpoint(url)
print(json.dumps(json_response, indent=4, sort_keys=True))
```

14 tweets not found
```python=
# tweets not found
# The API only includes "errors" when some ids could not be returned,
# so fall back to an empty list instead of raising KeyError.
print(json_normalize(json_response.get('errors', [])))
```
```
value detail \
0 973604160364048384 Could not find tweet with ids: [97360416036404...
1 973921707361697792 Could not find tweet with ids: [97392170736169...
2 981915145961029633 Could not find tweet with ids: [98191514596102...
3 971692212139409409 Could not find tweet with ids: [97169221213940...
4 972081179280031749 Sorry, you are not authorized to see the Tweet...
5 972484861264097280 Sorry, you are not authorized to see the Tweet...
6 974374689383567362 Could not find tweet with ids: [97437468938356...
7 981191844422971393 Sorry, you are not authorized to see the Tweet...
8 972518214465863686 Could not find tweet with ids: [97251821446586...
9 978580700533202944 Sorry, you are not authorized to see the Tweet...
10 988536839714820097 Sorry, you are not authorized to see the Tweet...
11 923171852201156608 Could not find tweet with ids: [92317185220115...
12 975559993528324102 Could not find tweet with ids: [97555999352832...
13 972143523922288640 Could not find tweet with ids: [97214352392228...
title resource_type parameter resource_id \
0 Not Found Error tweet ids 973604160364048384
1 Not Found Error tweet ids 973921707361697792
2 Not Found Error tweet ids 981915145961029633
3 Not Found Error tweet ids 971692212139409409
4 Authorization Error tweet ids 972081179280031749
5 Authorization Error tweet ids 972484861264097280
6 Not Found Error tweet ids 974374689383567362
7 Authorization Error tweet ids 981191844422971393
8 Not Found Error tweet ids 972518214465863686
9 Authorization Error tweet ids 978580700533202944
10 Authorization Error tweet ids 988536839714820097
11 Not Found Error tweet ids 923171852201156608
12 Not Found Error tweet ids 975559993528324102
13 Not Found Error tweet ids 972143523922288640

type section
0 https://api.twitter.com/2/problems/resource-no... NaN
1 https://api.twitter.com/2/problems/resource-no... NaN
2 https://api.twitter.com/2/problems/resource-no... NaN
3 https://api.twitter.com/2/problems/resource-no... NaN
4 https://api.twitter.com/2/problems/not-authori... data
5 https://api.twitter.com/2/problems/not-authori... data
6 https://api.twitter.com/2/problems/resource-no... NaN
7 https://api.twitter.com/2/problems/not-authori... data
8 https://api.twitter.com/2/problems/resource-no... NaN
9 https://api.twitter.com/2/problems/not-authori... data
10 https://api.twitter.com/2/problems/not-authori... data
11 https://api.twitter.com/2/problems/resource-no... NaN
12 https://api.twitter.com/2/problems/resource-no... NaN
13 https://api.twitter.com/2/problems/resource-no... NaN
```

86 tweets found
```python=
# tweets found
# "data" is absent when none of the requested ids could be returned,
# so fall back to an empty list instead of raising KeyError.
print(json_normalize(json_response.get('data', [])))
```
```
source id author_id \
0 Twitter for iPhone 971761970117357568 3118851863
1 Twitter Web Client 950934259371520000 4784471557
2 TwInbox 973718376496357377 15248550
3 Twitter Web Client 996772902006599680 25998862
4 Twitter Web Client 971712098253320193 14949738
.. ... ... ...
81 Twitter Web Client 971758871210528774 19966431
82 Twitter Web Client 974665180813709312 14662269
83 Twitter Web Client 943170920658751490 50729595
84 Twitter Web Client 982315443787624453 28730834
85 jetlifepennystocks 992093382473236480 555296885

text lang
0 Cigna and ESI set to merge. Here we go... en
1 Express Scripts Closes Acquisition Of eviCore;... en
2 RT @_diginsurance: Cigna-Express Scripts deal ... en
3 Here's the just-released 400+ page merger prox... en
4 Cigna nears deal for Express Scripts https://t... en
.. ... ...
81 @ExpressScripts $ESRX surges 14% after $CI say... en
82 In this week's #HealthTech round up @Cigna rol... en
83 Holland & Knight provides regulatory couns... en
84 Hey, remember how DOJ blocked Anthem's propose... en
85 Cigna CEO: There's a lot of noise around regul... en

[86 rows x 5 columns]
```