Created
July 1, 2016 18:49
-
-
Save edglazer/35f1c33b65a85d75892eade0dbcad8c1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Timer unit: 1e-06 s | |
Total time: 1401.2 s | |
File: tribe/extract.py | |
Function: extract_graph at line 105 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
105 @profile | |
106 def extract_graph(self): | |
107 """ | |
108 Extracts a Graph where the nodes are EmailAddress | |
109 """ | |
110 | |
111 1 3 3.0 0.0 def relationships(email): | |
112 """ | |
113 Inner function that constructs email relationships | |
114 """ | |
115 people = [email.sender,] | |
116 people.extend(email.recipients) | |
117 people.extend(email.copied) | |
118 | |
119 people = filter(lambda p: p is not None, people) # Filter out any None addresses | |
120 people = set(addr.email for addr in people if addr.email) # Obtain only unique people | |
121 people = sorted(people) # Sort lexicographically for combinations | |
122 | |
123 for combo in combinations(people, 2): | |
124 yield combo | |
125 | |
126 | |
127 # Keep track of all the email to email links | |
128 1 18 18.0 0.0 links = FreqDist() | |
129 | |
130 # Iterate over all the extracted emails | |
131 # Catch exceptions, if any, and move forward | |
132 # NOTE: This will allow the progress bar to work | |
133 # NOTE: This will build the graph data structure in memory | |
134 114956 1159836727 10089.4 82.8 for email in self.extract(): | |
135 114955 136897 1.2 0.0 try: | |
136 4870025 7657826 1.6 0.5 for combo in relationships(email): | |
137 4755070 7686760 1.6 0.5 links[combo] += 1 | |
138 except Exception as e: | |
139 self.errors[e] += 1 | |
140 continue | |
141 | |
142 # Construct the networkx graph and add edges | |
143 1 66 66.0 0.0 G = nx.Graph(name="Email Network", mbox=self.path, extracted=strfnow()) | |
144 4173 67408 16.2 0.0 for link in links.keys(): | |
145 4173 225812988 54112.9 16.1 G.add_edge(*link, weight=links.freq(link)) | |
146 | |
147 # Return the generated graph | |
148 return G | |
➜ tribe git:(master) ✗ pip install -U memory_profiler | |
Requirement already up-to-date: memory_profiler in /usr/local/lib/python2.7/site-packages | |
➜ tribe git:(master) ✗ pip install psutil | |
Requirement already satisfied (use --upgrade to upgrade): psutil in /usr/local/lib/python2.7/site-packages | |
➜ tribe git:(master) ✗ pip install -U psutil | |
Requirement already up-to-date: psutil in /usr/local/lib/python2.7/site-packages | |
➜ tribe git:(master) ✗ kernprof --help | |
Usage: kernprof [-s setupfile] [-o output_file_path] scriptfile [arg] ... | |
Options: | |
--version show program's version number and exit | |
-h, --help show this help message and exit | |
-l, --line-by-line Use the line-by-line profiler from the line_profiler | |
module instead of Profile. Implies --builtin. | |
-b, --builtin Put 'profile' in the builtins. Use 'profile.enable()' | |
and 'profile.disable()' in your code to turn it on and | |
off, or '@profile' to decorate a single function, or | |
'with profile:' to profile a single section of code. | |
-o OUTFILE, --outfile=OUTFILE | |
Save stats to <outfile> | |
-s SETUP, --setup=SETUP | |
Code to execute before the code to profile | |
-v, --view View the results of the profile in addition to saving | |
it. | |
➜ tribe git:(master) ✗ python -m memory_profiler tribe-admin.py extract -w myemails.graphml allmail.mbox | |
Starting Graph extraction, a long running process | |
Initializing MBox iteration on allmail.mbox (8.8GiB) | |
^CFilename: tribe/extract.py seconds | Parsed: 97431 emails | |
Line # Mem usage Increment Line Contents | |
================================================ | |
105 70.691 MiB 0.000 MiB @profile | |
106 def extract_graph(self): | |
107 """ | |
108 Extracts a Graph where the nodes are EmailAddress | |
109 """ | |
110 | |
111 490.387 MiB 419.695 MiB def relationships(email): | |
112 """ | |
113 Inner function that constructs email relationships | |
114 """ | |
115 490.387 MiB 0.000 MiB people = [email.sender,] | |
116 490.387 MiB 0.000 MiB people.extend(email.recipients) | |
117 490.387 MiB 0.000 MiB people.extend(email.copied) | |
118 | |
119 490.387 MiB 0.000 MiB people = filter(lambda p: p is not None, people) # Filter out any None addresses | |
120 490.387 MiB 0.000 MiB people = set(addr.email for addr in people if addr.email) # Obtain only unique people | |
121 490.387 MiB 0.000 MiB people = sorted(people) # Sort lexicographically for combinations | |
122 | |
123 490.387 MiB 0.000 MiB for combo in combinations(people, 2): | |
124 490.387 MiB 0.000 MiB yield combo | |
125 | |
126 | |
127 # Keep track of all the email to email links | |
128 70.691 MiB -419.695 MiB links = FreqDist() | |
129 | |
130 # Iterate over all the extracted emails | |
131 # Catch exceptions, if any, and move forward | |
132 # NOTE: This will allow the progress bar to work | |
133 # NOTE: This will build the graph data structure in memory | |
134 490.387 MiB 419.695 MiB for email in self.extract(): | |
135 490.387 MiB 0.000 MiB try: | |
136 490.387 MiB 0.000 MiB for combo in relationships(email): | |
137 490.387 MiB 0.000 MiB links[combo] += 1 | |
138 except Exception as e: | |
139 self.errors[e] += 1 | |
140 continue | |
141 | |
142 # Construct the networkx graph and add edges | |
143 G = nx.Graph(name="Email Network", mbox=self.path, extracted=strfnow()) | |
144 for link in links.keys(): | |
145 G.add_edge(*link, weight=links.freq(link)) | |
146 | |
147 # Return the generated graph | |
148 return G | |
Traceback (most recent call last): | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 162, in _run_module_as_main | |
"__main__", fname, loader, pkg_name) | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code | |
exec code in run_globals | |
File "/usr/local/lib/python2.7/site-packages/memory_profiler.py", line 982, in <module> | |
exec_with_profiler(script_filename, prof) | |
File "/usr/local/lib/python2.7/site-packages/memory_profiler.py", line 917, in exec_with_profiler | |
execfile(filename, ns, ns) | |
File "tribe-admin.py", line 179, in <module> | |
main(*sys.argv[1:]) | |
File "tribe-admin.py", line 173, in main | |
msg = args.func(args) # Call the default function | |
File "tribe-admin.py", line 91, in extract | |
errors, seconds = timed_inner(args.mbox[0], args.write) | |
File "tribe/utils.py", line 118, in wrapper | |
result = func(*args, **kwargs) | |
File "tribe-admin.py", line 86, in timed_inner | |
G = reader.extract_graph() | |
File "/usr/local/lib/python2.7/site-packages/memory_profiler.py", line 498, in f | |
return func(*args, **kwds) | |
File "tribe/extract.py", line 134, in extract_graph | |
for email in self.extract(): | |
File "tribe/extract.py", line 97, in extract | |
for msg in self: | |
File "tribe/extract.py", line 177, in __iter__ | |
for msg in super(ConsoleMBoxReader, self).__iter__(): | |
File "tribe/extract.py", line 46, in __iter__ | |
for msg in self.mbox: | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/mailbox.py", line 108, in itervalues | |
value = self[key] | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/mailbox.py", line 80, in __getitem__ | |
return self.get_message(key) | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/mailbox.py", line 775, in get_message | |
msg = self._message_factory(string.replace(os.linesep, '\n')) | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/mailbox.py", line 1598, in __init__ | |
Message.__init__(self, message) | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/mailbox.py", line 1459, in __init__ | |
self._become_message(email.message_from_string(message)) | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/__init__.py", line 57, in message_from_string | |
return Parser(*args, **kws).parsestr(s) | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/parser.py", line 82, in parsestr | |
return self.parse(StringIO(text), headersonly=headersonly) | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/parser.py", line 71, in parse | |
feedparser.feed(data) | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/feedparser.py", line 178, in feed | |
self._call_parse() | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/feedparser.py", line 182, in _call_parse | |
self._parse() | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/feedparser.py", line 373, in _parsegen | |
for retval in self._parsegen(): | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/feedparser.py", line 446, in _parsegen | |
if line is NeedMoreData: | |
File "/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/email/feedparser.py", line 446, in _parsegen | |
if line is NeedMoreData: | |
File "/usr/local/lib/python2.7/site-packages/memory_profiler.py", line 537, in trace_memory_usage | |
def trace_memory_usage(self, frame, event, arg): | |
KeyboardInterrupt | |
Exception KeyboardInterrupt in <module 'threading' from '/usr/local/Cellar/python/2.7.10_2/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.pyc'> ignored | |
^X% ➜ tribe git:(master) ✗ | |
➜ tribe git:(master) ✗ kernprof -l tribe-admin.py extract -w myemails.graphml allmail.mbox | |
Starting Graph extraction, a long running process | |
Initializing MBox iteration on allmail.mbox (8.8GiB) | |
^CWrote profile results to tribe-admin.py.lprof. | |
Elapsed: 14 minutes 41 seconds | initializing ... | |
Session Restored | |
Last login: Fri Jul 1 11:44:11 on console | |
You have new mail. | |
➜ tribe git:(master) ✗ kernprof -l tribe-admin.py extract -w myemails.graphml allmail.mbox | |
Starting Graph extraction, a long running process | |
Initializing MBox iteration on allmail.mbox (8.8GiB) | |
Elapsed: 18 minutes 35 seconds | Parsed: 114856 emails | |
GraphML written out to myemails.graphml | |
No errors encountered in processing | |
Graph extraction took 19 minutes 44 seconds | |
Wrote profile results to tribe-admin.py.lprof | |
➜ tribe git:(master) ✗ python -m line_profiler tribe-admin.py.lprof | |
Timer unit: 1e-06 s | |
Total time: 1119.33 s | |
File: tribe/extract.py | |
Function: extract_graph at line 105 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
105 @profile | |
106 def extract_graph(self): | |
107 """ | |
108 Extracts a Graph where the nodes are EmailAddress | |
109 """ | |
110 | |
111 1 13 13.0 0.0 def relationships(email): | |
112 """ | |
113 Inner function that constructs email relationships | |
114 """ | |
115 people = [email.sender,] | |
116 people.extend(email.recipients) | |
117 people.extend(email.copied) | |
118 | |
119 people = filter(lambda p: p is not None, people) # Filter out any None addresses | |
120 people = set(addr.email for addr in people if addr.email) # Obtain only unique people | |
121 people = sorted(people) # Sort lexicographically for combinations | |
122 | |
123 for combo in combinations(people, 2): | |
124 yield combo | |
125 | |
126 | |
127 # Keep track of all the email to email links | |
128 1 8 8.0 0.0 links = FreqDist() | |
129 | |
130 # Iterate over all the extracted emails | |
131 # Catch exceptions, if any, and move forward | |
132 # NOTE: This will allow the progress bar to work | |
133 # NOTE: This will build the graph data structure in memory | |
134 114956 1094585725 9521.8 97.8 for email in self.extract(): | |
135 114955 116953 1.0 0.0 try: | |
136 4870025 7024773 1.4 0.6 for combo in relationships(email): | |
137 4755070 7022225 1.5 0.6 links[combo] += 1 | |
138 except Exception as e: | |
139 self.errors[e] += 1 | |
140 continue | |
141 | |
142 # Construct the networkx graph and add edges | |
143 1 68 68.0 0.0 G = nx.Graph(name="Email Network", mbox=self.path, extracted=strfnow()) | |
144 1837532 1518627 0.8 0.1 for link in links.keys(): | |
145 G.add_edge(*link, weight=links.freq(link)) # took out; changed per suggestion of @bbengfort, as the calculation may be too strenuous | |
146 1837531 9065502 4.9 0.8 # G.add_edge(*link, weight=links[link]) | |
147 | |
148 # Return the generated graph | |
149 1 3 3.0 0.0 return G |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment