Created
November 16, 2023 01:10
-
-
Save trojblue/06bb89e9f49512d19383ed5f3cfce2eb to your computer and use it in GitHub Desktop.
User prompt (with the file fevercell_projects.json attached): extract all twitter.com or x.com links from this JSON file:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
# Extract the handles that follow a given domain anywhere in nested JSON data.
def extract_handles(data, domain):
    """Collect the handle that follows ``domain`` in each string inside ``data``.

    ``data`` is walked depth-first: dict values and list items are visited in
    their natural order (dict keys are never inspected).  For every string that
    contains ``domain``, the handle is the first whitespace-delimited token of
    the text between the first and the second occurrence of ``domain`` (or the
    end of the string when there is only one occurrence).

    Args:
        data: A decoded JSON value -- a dict, a list (nested to any depth) or
            a plain string.  Values of any other type are ignored.
        domain: Marker that precedes the handle, e.g. ``'twitter.com/'``.

    Returns:
        A list of handles in encounter order.  Duplicates are kept; strings in
        which nothing follows ``domain`` contribute no entry.

    Raises:
        ValueError: If ``domain`` is an empty string and a string value is
            encountered (``str.split`` rejects an empty separator).
    """

    def first_handle(text):
        # Return the handle found in ``text``, or None when there is none.
        if domain not in text:
            return None
        tokens = text.split(domain)[1].split()
        # A link with nothing after the domain ("https://twitter.com/") yields
        # no tokens; indexing [0] unconditionally would raise IndexError.
        return tokens[0] if tokens else None

    def walk(node):
        # Yield handles from ``node`` and everything nested below it.
        if isinstance(node, dict):
            for value in node.values():
                yield from walk(value)
        elif isinstance(node, list):
            for item in node:
                yield from walk(item)
        elif isinstance(node, str):
            handle = first_handle(node)
            if handle:
                yield handle

    return list(walk(data))
# Read a JSON file and return the unique handles found in it.
def process_json(file_path, domain):
    """Load the JSON file at ``file_path`` and return its unique handles.

    Args:
        file_path: Path of the JSON file to read.
        domain: Marker that precedes a handle, e.g. ``'twitter.com/'``;
            passed through unchanged to ``extract_handles``.

    Returns:
        A list of the distinct handles, in the order they were first
        encountered in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; relying on the locale default encoding can fail to
    # decode non-ASCII content on platforms whose default is not UTF-8.
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)

    handles = extract_handles(data, domain)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is reproducible from run to run (set iteration order is not).
    return list(dict.fromkeys(handles))
# Example usage.  Guarded so that importing this module (e.g. to reuse
# extract_handles / process_json) does not touch the filesystem; running the
# file as a script behaves as before.
if __name__ == "__main__":
    file_path = '/path/to/your/jsonfile.json'  # Replace with your JSON file path
    # To extract from another site, change 'domain' accordingly,
    # e.g. 'pixiv.net/users/'.
    domain = 'twitter.com/'

    # Extracting Twitter handles
    twitter_handles = process_json(file_path, domain)

    # Saving to a text file, one handle per line.  The encoding is explicit so
    # the output does not depend on the platform's locale.
    output_file_path = '/path/to/output/file.txt'  # Replace with your desired output file path
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(twitter_handles))

    # Output file path for reference
    print(output_file_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment