Last active
October 7, 2016 07:08
-
-
Save Huluk/cbe98c6a00e305a6ede268909547f398 to your computer and use it in GitHub Desktop.
TrustYou Application
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Programming task | |
================ | |
Implement the method iter_sample below to make the Unit test pass. iter_sample | |
is supposed to peek at the first n elements of an iterator, and determine the | |
minimum and maximum values (using their comparison operators) found in that | |
sample. To make it more interesting, the method is supposed to return an | |
iterator which will return the same exact elements that the original one would | |
have yielded, i.e. the first n elements can't be missing. | |
You may make use of Python's standard library. Python 3 is allowed, even though | |
it's not supported by codepad apparently. | |
Create your solution as a private fork, and send us the URL. | |
""" | |
from itertools import islice, chain, count | |
import unittest | |
def iter_sample(it, n): | |
""" | |
Peeks at the first n elements of an iterator and determine the min and max | |
values. Preserves all elements in the iterator. | |
@param it: Iterator, potentially infinite | |
@param n: Number of elements to peek off the iterator | |
@return: Tuple of minimum, maximum (in sample), and an iterator that yields | |
all elements that would have been yielded by the original iterator. | |
""" | |
if n > 0 and it: | |
visited = [it.next()] | |
minimum = visited[0] | |
maximum = visited[0] | |
for x in islice(it, n-1): | |
if x < minimum: | |
minimum = x | |
if x > maximum: | |
maximum = x | |
visited.append(x) | |
return minimum, maximum, chain(visited, it) | |
else: | |
return None, None, it | |
class StreamSampleTestCase(unittest.TestCase): | |
def test_smoke(self): | |
# sample only the first 10 elements of a range of length 100 | |
it = iter(range(100)) | |
min_val, max_val, new_it = iter_sample(it, 10) | |
self.assertEqual(0, min_val) | |
self.assertEqual(9, max_val) | |
# all elements are still there: | |
self.assertEqual(list(range(100)), list(new_it)) | |
def test_sample_all(self): | |
# sample more elements than there are - no error raised | |
# now we now the global maximum! | |
it = iter(range(100)) | |
min_val, max_val, new_it = iter_sample(it, 1000) | |
self.assertEqual(0, min_val) | |
self.assertEqual(99, max_val) | |
self.assertEqual(list(range(100)), list(new_it)) | |
def test_infinite_stream(self): | |
# and guess what - it also works with infinite iterators | |
it = count(0) | |
min_val, max_val, _ = iter_sample(it, 10) | |
self.assertEqual(0, min_val) | |
self.assertEqual(9, max_val) | |
if __name__ == "__main__": | |
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Programming task | |
================ | |
The following is an implementation of a simple Named Entity Recognition (NER). | |
NER is concerned with identifying place names, people names or other special | |
identifiers in text. | |
Here we make a very simple definition of a named entity: A sequence of | |
at least two consecutive capitalized words. E.g. "Los Angeles" is a named | |
entity, "our hotel" is not. | |
While the implementation passes the Unit test, it suffers from bad structure and | |
readability. It is your task to rework *both* the implementation and the Unit | |
test. You are expected to come up with a better interface than the one presented | |
here. | |
Your code will be evaluated on: | |
- Readability: Is naming intuitive? Are there comments where necessary? | |
- Structure: Is functionality grouped into functions or classes in a way that | |
enables reusability? | |
- Testability: Is it easy to test individual components of your algorithm? This | |
is a good indicator of good interface design. | |
- Bonus: Functional programming. Demonstrate how you have applied principles of | |
functional programming to improve this code. | |
If you want, explain reasons for changes you've made in comments. | |
Note that you don't have to improve the actual Named Entity Recognition | |
algorithm itself - the focus is on code quality. | |
""" | |
import re | |
import unittest | |
""" | |
Code Modifications | |
================== | |
My code splits functionality into the operations tokenize(text) and | |
named_entities(tokens). | |
I changed token_re to match only the token itself, rather than capturing the | |
whole rest of the string in the second group. This way, my tokenize function | |
can use lazy evaluation on the text and works better on longer input. | |
""" | |
# Regular expression for matching a token at the beginning of a sentence | |
token_re = re.compile(r"([a-z]+)\s*", re.I) | |
# Regular expression to recognize an uppercase token | |
uppercase_re = re.compile(r"[A-Z][a-z]*$") | |
def tokenize(text, token_re=token_re): | |
""" | |
Returns a generator for tokens in the given text. | |
""" | |
return (match.group(1) for match in token_re.finditer(text)) | |
def named_entities( | |
tokens, | |
is_entity_part=(lambda token: uppercase_re.match(token)), | |
min_repetitions=2): | |
""" | |
Extracts all named entities from the given token collection. | |
Named entities may span multiple tokens. | |
@param tokens: iterator over tokens | |
@param is_entity_part: function which takes a token and returns bool | |
@param min_repetitions: named entities consist of at least this many tokens | |
@return: a set of the named entities in the token list | |
""" | |
entities = set() | |
current_entity = [] | |
for token in tokens: | |
if is_entity_part(token): | |
current_entity.append(token) | |
else: | |
if len(current_entity) >= min_repetitions: | |
entities.add(' '.join(current_entity)) | |
current_entity = [] | |
if len(current_entity) >= min_repetitions: | |
entities.add(' '.join(current_entity)) | |
return entities | |
class NamedEntityTestCase(unittest.TestCase): | |
def test_ner_extraction(self): | |
text = "When we went to Los Angeles last year we visited the Hollywood Sign" | |
self.assertEqual(set(["Los Angeles", "Hollywood Sign"]), | |
named_entities(tokenize(text))) | |
if __name__ == "__main__": | |
unittest.main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment