"""Parsers for the Twitter/X list-timeline GraphQL response.

Each parse_* function takes a fragment of the raw JSON payload and returns
plain dicts that are easier to render downstream.
"""

import json
from datetime import datetime

from loguru import logger


def parse_timeline(data):
    """Parse a full timeline response into a list of tweets, newest first."""
    entries = data["data"]["list"]["tweets_timeline"]["timeline"]["instructions"][0]["entries"]
    result = []
    for entry in entries:
        try:
            result += parse_entry(entry)
        except Exception as e:
            logger.error(f"error when parsing entry: {e} {e.args}\n{entry}")
    result.sort(key=lambda x: x["timestamp"], reverse=True)
    return result


def parse_entry(entry):
    """Parse one timeline entry, which may hold a single tweet or a conversation."""
    result = []
    entry_id = entry["entryId"]
    if "list-conversation" in entry_id and "tweet" not in entry_id:
        # A conversation module: each item wraps its own tweet.
        for item in entry["content"]["items"]:
            data = parse_content(item["item"])
            if data:
                result.append(data)
    elif entry["content"]["__typename"] != "TimelineTimelineCursor":
        # A plain tweet entry; cursor entries carry no tweet content and are skipped.
        data = parse_content(entry["content"])
        if data:
            result.append(data)
    return result


def parse_content(content):
    """Parse a tweet plus any quoted or retweeted tweet attached to it."""
    tweet = content["itemContent"]["tweet_results"]["result"]
    # Some results (e.g. TweetWithVisibilityResults) nest the tweet one level deeper.
    while "rest_id" not in tweet:
        tweet = tweet["tweet"]
    try:
        data = parse_tweet(tweet)
        if "quoted_status_result" in tweet:
            data["quoted"] = parse_tweet(tweet["quoted_status_result"]["result"])
        if "retweeted_status_result" in tweet["legacy"]:
            data["retweeted"] = parse_tweet(tweet["legacy"]["retweeted_status_result"]["result"])
        return data
    except Exception as e:
        logger.error(f"error when parsing tweet: {e} {e.args}\n{tweet}")
        return {}


def parse_media(media):
    """Extract the original-size image URL and, for videos, the best-bitrate variant."""
    data = {"url": media["media_url_https"] + "?name=orig", "video": ""}
    if media["type"] in ["video", "animated_gif"]:
        # Keep only variants that report a bitrate (drops HLS playlists), highest first.
        variants = [i for i in media["video_info"]["variants"] if "bitrate" in i]
        variants.sort(key=lambda x: x["bitrate"], reverse=True)
        if variants:
            data["video"] = variants[0]["url"]
    return data


def parse_entities(entity):
    """Normalize a mention, hashtag, or URL entity to display text plus indices."""
    data = {"text": "", "indices": entity["indices"]}
    if "name" in entity:  # user mention
        data["text"] = "@" + entity["name"]
    if "text" in entity:  # hashtag
        data["text"] = "#" + entity["text"]
    if "display_url" in entity:  # URL
        data["text"] = entity["display_url"]
    return data


def parse_card(card):
    """Parse a card (poll, unified card, or summary card).

    Returns the extracted binding values and, if the card carries an image,
    a photo dict in the same shape parse_media produces.
    """
    data = {}
    for v in card["legacy"]["binding_values"]:
        if "choice" in v["key"] or v["key"] in [
            "end_datetime_utc",
            "unified_card",
            "summary_photo_image_original",
        ]:
            # The actual payload sits under a type-specific key, e.g. "string_value".
            value_name = f"{v['value']['type'].lower()}_value"
            data[v["key"]] = v["value"].get(value_name, "")

    photo = None
    if "unified_card" in data:
        # unified_card is a JSON string embedding its own media entities.
        card_data = json.loads(data["unified_card"])
        del data["unified_card"]
        try:
            for v in card_data["media_entities"].values():
                if "media_url_https" in v:
                    photo = {"url": v["media_url_https"] + "?name=orig", "video": ""}
                    break
        except Exception:
            logger.error(f"error parsing unified_card {card_data}")
    if "summary_photo_image_original" in data:
        photo = {"url": data["summary_photo_image_original"]["url"], "video": ""}
        del data["summary_photo_image_original"]
    return data, photo


def parse_tweet(tweet):
    """Flatten one tweet's nested JSON into a single dict."""
    # with open("tweet.json", "w") as f: json.dump(tweet, f)  # debug dump
    while "rest_id" not in tweet:
        tweet = tweet["tweet"]
    user = tweet["core"]["user_results"]["result"]
    data = {
        "rest_id": tweet["rest_id"],
        "name": user["legacy"]["name"],
        "screen_name": user["legacy"]["screen_name"],
        "profile_image": user["legacy"]["profile_image_url_https"],
        "profile_image_shape": user["profile_image_shape"],
        "full_text": tweet["legacy"]["full_text"],
        "created_at": tweet["legacy"]["created_at"],
        "timestamp": int(datetime.strptime(
            tweet["legacy"]["created_at"], "%a %b %d %H:%M:%S %z %Y"
        ).timestamp()),
        "reply_to": "",
        "media": [],
        "entities": [],
        "quoted": {},
        "retweeted": {},
        "card": {},
    }
    # Request the full-size avatar instead of the "_normal" thumbnail.
    data["profile_image"] = data["profile_image"].replace("_normal.", ".")
    if "in_reply_to_status_id_str" in tweet["legacy"]:
        data["reply_to"] = tweet["legacy"]["in_reply_to_status_id_str"]
    for m in tweet["legacy"]["entities"].get("media", []):
        data["media"].append(parse_media(m))
    for e in ["user_mentions", "hashtags", "urls"]:
        for m in tweet["legacy"]["entities"].get(e, []):
            data["entities"].append(parse_entities(m))
    # Sort entities by their position in full_text so they render in order.
    data["entities"].sort(key=lambda x: x["indices"][0])
    if "card" in tweet:
        data["card"], _photo = parse_card(tweet["card"])
        if _photo:
            data["media"].append(_photo)
    return data