Skip to content

Instantly share code, notes, and snippets.

@estasney
estasney / rebuild_search_indices.py
Last active May 21, 2018 23:55 — forked from davebarkerxyz/rebuild_search_indices.py
Rebuild Flask-WhooshAlchemy search indices (Python 3, Mega-Tutorial Style)
#!/usr/bin/env python
import datetime
import sys
sys.path.append("mysite")
from app_folder import create_app
from app_folder.models import YOUR_MODEL
app = create_app()
# -*- coding: utf-8 -*-
import itertools
import re
import urlparse
import boto
import warc
from boto.s3.key import Key
from gzipstream import GzipStreamFile
@estasney
estasney / name_search.py
Last active August 15, 2018 16:52
Performance oriented string search across multiple datasets
from collections import OrderedDict
from operator import itemgetter
class NameData(object):
def __init__(self, data, name, priority, preprocessor=None):
self.data = self.structure_data(data)
self.name_set = self.generate_set(data)
self.name = name
self.priority = priority
@estasney
estasney / analyze_tags.py
Created June 29, 2018 01:47
Co-occurrence of StackOverflow Tags
# Given a Stack Overflow tag, how often does it co-occur with other tags?
# See example query https://data.stackexchange.com/stackoverflow/query/868423/co-occurrence-of-tags
# Download the query results as a CSV file and point CSV_PATH at it
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
CSV_PATH = ""
TAG = ""
@estasney
estasney / extract_xml.py
Created July 13, 2018 01:46
Parsing XML of Stack Overflow Data Dumps
from gensim.utils import smart_open
from collections import defaultdict, OrderedDict
import csv
import xml.etree.ElementTree as ET
headers = ['AcceptedAnswerId', 'AnswerCount', 'ClosedDate', 'CommentCount', 'CommunityOwnedDate', 'CreationDate',
'FavoriteCount', 'Id', 'LastActivityDate', 'LastEditDate', 'LastEditorDisplayName', 'LastEditorUserId',
'OwnerDisplayName', 'OwnerUserId', 'ParentId', 'PostTypeId', 'Score', 'Tags', 'Title', 'ViewCount']
file_path = ""
@estasney
estasney / coloring.py
Created September 6, 2018 02:34
Pattern NLP
import math
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
def node_type(x):
if isinstance(x, str):
return 2
@estasney
estasney / manifest.json
Created September 9, 2018 18:20 — forked from siumeiman/manifest.json
Download LinkedIn
{
  "manifest_version": 2,
  "name": "LinkedIn Profile Saver",
  "version": "1.0.0",
  "content_scripts": [{
    "matches": [
      "http://*.linkedin.com/in/*",
      "https://*.linkedin.com/in/*",
      "http://*.linkedin.com/profile/*",
      "https://*.linkedin.com/profile/*"
@estasney
estasney / WordMap.py
Last active October 15, 2018 02:16
WordMappings
from cytoolz import groupby
class WordPair(object):
PREFERRED = 'preferred'
OTHERS = 'others'
def __init__(self, preferred, others):
self.preferred = preferred
if isinstance(others, list):
@estasney
estasney / backoff.py
Last active January 25, 2021 15:55
Backoff Decorator - Accepting Parameters
import time
import random
class BackOffDecorator(object):
    """Hold the parameters and attempt counter for a backoff decorator.

    Only the constructor is visible in this excerpt; how the stored values
    are consumed (the retry/decorating logic itself) is not shown here.
    """

    def __init__(self, max_tries, delay, backoff_rate):
        """Store the backoff configuration and reset the attempt counter.

        Args:
            max_tries: Stored as ``self.max_tries``; presumably the maximum
                number of attempts -- confirm against the calling logic.
            delay: Stored as ``self.delay``; presumably the initial wait
                between attempts (units not established here).
            backoff_rate: Stored as ``self.backoff_rate``; presumably the
                factor applied to ``delay`` after a failure -- confirm.
        """
        self.max_tries = max_tries
        # The counter always starts at zero for a freshly built instance.
        self.tries = 0
        self.delay = delay
        self.backoff_rate = backoff_rate
import pandas as pd
import os
# User-editable settings; the values below are placeholders.
SPREADSHEET_FOLDER = r"C:\Users"  # Which folder are these files located in?
OUTPUT_FILEPATH = r"C:\Users"  # Where should it go?

# Fail fast if the configured input folder does not exist, and name the
# offending path so the user knows which setting to correct.
if not os.path.isdir(SPREADSHEET_FOLDER):
    raise NotADirectoryError(SPREADSHEET_FOLDER)

# Compare the actual file extension instead of searching the whole path for
# a substring: a substring test also accepts paths whose directory name
# merely contains ".xls"/".csv", and extensions such as ".xlsm", which the
# error message below does not allow. The comparison is case-insensitive so
# that e.g. ".XLSX" is accepted.
if os.path.splitext(OUTPUT_FILEPATH)[1].lower() not in ('.csv', '.xls', '.xlsx'):
    # ValueError is a subclass of Exception, so existing handlers still match.
    raise ValueError("Output path must have ext of .csv, .xls, or .xlsx")