Last active
July 26, 2023 01:13
-
-
Save NewscatcherAPI/501ba40acebd17bf2660f8919f50550e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def is_company_acquisition(headline_doc):
    """Cheap pre-filter: does the headline look like a company acquisition?

    Two conditions must both hold:
      * at least one token has the lemma 'acquire';
      * at least two entities are labelled 'ORG'.

    Args:
        headline_doc: Parsed headline. It is iterated for tokens exposing
            ``lemma_`` and its ``ents`` must yield objects exposing
            ``label_`` (a spaCy ``Doc`` in this file).

    Returns:
        bool: True when both conditions hold, False otherwise (including
        for an empty document).
    """
    # The lemma is compared rather than the surface text so that inflected
    # forms of the verb are expected to match as well (lemmatizer-dependent).
    if not any(token.lemma_ == 'acquire' for token in headline_doc):
        return False

    # An acquisition needs a buyer and a target, hence at least two ORGs.
    org_count = sum(1 for ent in headline_doc.ents if ent.label_ == 'ORG')
    return org_count >= 2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def detect_acquisition(headline_doc):
    """Print a one-line verdict for a headline.

    The headline is first screened with ``is_company_acquisition``. If it
    passes, ``find_acquirer`` and ``find_acquired`` are each called once.
    When both return a non-empty result the line
    ``<headline> --> <acquirer> acquires <acquired>`` is printed; in every
    other case ``<headline> --> no acquisition detected`` is printed.

    Args:
        headline_doc: Parsed headline, passed through unchanged to the
            helper functions; ``str(headline_doc)`` is used for display.

    Returns:
        None. The result is written to standard output.
    """
    acquirer = None
    acquired = None
    if is_company_acquisition(headline_doc):
        # Run each (relatively expensive) finder once and reuse the result.
        acquirer = find_acquirer(headline_doc)
        acquired = find_acquired(headline_doc)

    # Truthiness treats both None and an empty list as "nothing found", so
    # the check works whichever of the two a finder uses to signal a miss.
    if acquirer and acquired:
        print(str(headline_doc) + " --> " + str(acquirer) + " acquires " + str(acquired))
    else:
        print(str(headline_doc) + " --> " + "no acquisition detected")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse a sample headline whose acquisition target is a conjunction
# ("Paidy Inc. and Microsoft") and hand it to displaCy with the "dep"
# (dependency) style for visual inspection.
doc = nlp("PayPal decided to acquire Paidy Inc. and Microsoft")
spacy.displacy.serve(
    doc,
    style="dep",
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def find_acquired(headline_doc):
    """Collect the noun chunks naming the organisation(s) being acquired.

    A token is treated as an acquisition target when it is part of an ORG
    entity, its syntactic head has the lemma 'acquire', and its dependency
    label is 'attr' or 'dobj'. ORG tokens attached to such a target as a
    right-hand 'conj' child ("X and Y") are treated as targets too.

    Args:
        headline_doc: Parsed headline. It is iterated for tokens exposing
            ``ent_type_``, ``head``, ``dep_`` and ``rights``, and must
            provide ``noun_chunks`` (a spaCy ``Doc`` in this file).

    Returns:
        list: The matching noun chunks in the order found, without
        duplicates. Empty when nothing matched.
    """
    # Use the document that was passed in (not a module-level ``doc``), and
    # materialise its noun chunks once instead of once per candidate token.
    noun_chunks = list(headline_doc.noun_chunks)
    acquired_list = []

    def _add_chunks_containing(tok):
        # Record every noun chunk that contains ``tok``, skipping duplicates.
        for chunk in noun_chunks:
            if tok in chunk and chunk not in acquired_list:
                acquired_list.append(chunk)

    for token in headline_doc:
        is_target = (
            token.ent_type_ == 'ORG'
            and token.head.lemma_ == 'acquire'
            and token.dep_ in ('attr', 'dobj')
        )
        if not is_target:
            continue
        _add_chunks_containing(token)
        # Coordinated targets hang off the first target as 'conj' children.
        # Compare with ==: ``in ('conj')`` would be a substring test.
        for tright in token.rights:
            if tright.ent_type_ == 'ORG' and tright.dep_ == 'conj':
                _add_chunks_containing(tright)
    return acquired_list
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def find_acquirer(headline_doc):
    """Collect the noun chunks naming the acquiring organisation(s).

    A token is treated as an acquirer when it is part of an ORG entity and
    its dependency label is 'nsubj' or 'ROOT'. ORG tokens attached to such
    a token as a right-hand 'conj' child ("X and Y") are included too.

    Args:
        headline_doc: Parsed headline. It is iterated for tokens exposing
            ``ent_type_``, ``dep_`` and ``rights``, and must provide
            ``noun_chunks`` (a spaCy ``Doc`` in this file).

    Returns:
        list or None: The matching noun chunks in the order found, without
        duplicates, or None when no acquirer was found.
    """
    # Use the document that was passed in (not a module-level ``doc``), and
    # materialise its noun chunks once instead of once per candidate token.
    noun_chunks = list(headline_doc.noun_chunks)
    acquirer_list = []

    def _add_chunks_containing(tok):
        # Record every noun chunk that contains ``tok``, skipping duplicates.
        for chunk in noun_chunks:
            if tok in chunk and chunk not in acquirer_list:
                acquirer_list.append(chunk)

    for token in headline_doc:
        if not (token.ent_type_ == 'ORG' and token.dep_ in ('nsubj', 'ROOT')):
            continue
        _add_chunks_containing(token)
        # Coordinated subjects hang off the first subject as 'conj' children.
        # Compare with ==: ``in ('conj')`` would be a substring test.
        for tright in token.rights:
            if tright.ent_type_ == 'ORG' and tright.dep_ == 'conj':
                _add_chunks_containing(tright)

    # Callers test the result against None, so an empty list is not returned.
    if acquirer_list:
        return acquirer_list
    return None
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
# 1. load the English pipeline used for every parse below
nlp = spacy.load('en_core_web_lg')

# 2. preset headlines
headlines = [
    "PayPal to acquire Paidy Inc.",
    "Pump solutions and water technology company Grundfos has entered into an agreement to acquire Mechanical Equipment Company (MECO) for an undisclosed sum.",
    "Cavaliers acquire Lauri Markkanen",
    "Mastercard to acquire CipherTrace",
    "TransUnion to acquire Neustar and Newstar",
    "Microsoft enters into agreement to acquire Myhotelshop to help hotels optimize guest acquisition",
    "CDK to help car buyers acquire insurance",
    "Tyber Medical reaches agreement to acquire CatapultMD",
    "Rebels acquire forward Stevenson from Blades",
    "Google to acquire 1.3M-SF Manhattan office for $2.1B",
    "PayPal decided to acquire Paidy Inc. and Microsoft",
]

# Parse every headline once, then show the (text, label) pair of each
# named entity that was recognised in it.
docs = list(map(nlp, headlines))
for doc in docs:
    print([(entity.text, entity.label_) for entity in doc.ents])
def is_company_acquisition(headline_doc):
    """Cheap pre-filter: does the headline look like a company acquisition?

    Two conditions must both hold:
      * at least one token has the lemma 'acquire';
      * at least two entities are labelled 'ORG'.

    Args:
        headline_doc: Parsed headline. It is iterated for tokens exposing
            ``lemma_`` and its ``ents`` must yield objects exposing
            ``label_`` (a spaCy ``Doc`` in this file).

    Returns:
        bool: True when both conditions hold, False otherwise (including
        for an empty document).
    """
    # The lemma is compared rather than the surface text so that inflected
    # forms of the verb are expected to match as well (lemmatizer-dependent).
    if not any(token.lemma_ == 'acquire' for token in headline_doc):
        return False

    # An acquisition needs a buyer and a target, hence at least two ORGs.
    org_count = sum(1 for ent in headline_doc.ents if ent.label_ == 'ORG')
    return org_count >= 2
# 4. use the dependency parse to find the companies involved
# 4.1 the acquired company must be an object ('dobj' / 'attr') of the verb 'acquire'
def find_acquired(headline_doc):
    """Collect the noun chunks naming the organisation(s) being acquired.

    A token is treated as an acquisition target when it is part of an ORG
    entity, its syntactic head has the lemma 'acquire', and its dependency
    label is 'attr' or 'dobj'. ORG tokens attached to such a target as a
    right-hand 'conj' child ("X and Y") are treated as targets too.

    Args:
        headline_doc: Parsed headline. It is iterated for tokens exposing
            ``ent_type_``, ``head``, ``dep_`` and ``rights``, and must
            provide ``noun_chunks`` (a spaCy ``Doc`` in this file).

    Returns:
        list or None: The matching noun chunks in the order found, without
        duplicates, or None when no target was found.
    """
    # Use the document that was passed in (not a module-level ``doc``), and
    # materialise its noun chunks once instead of once per candidate token.
    noun_chunks = list(headline_doc.noun_chunks)
    acquired_list = []

    def _add_chunks_containing(tok):
        # Record every noun chunk that contains ``tok``, skipping duplicates.
        for chunk in noun_chunks:
            if tok in chunk and chunk not in acquired_list:
                acquired_list.append(chunk)

    for token in headline_doc:
        is_target = (
            token.ent_type_ == 'ORG'
            and token.head.lemma_ == 'acquire'
            and token.dep_ in ('attr', 'dobj')
        )
        if not is_target:
            continue
        _add_chunks_containing(token)
        # Coordinated targets hang off the first target as 'conj' children.
        # Compare with ==: ``in ('conj')`` would be a substring test.
        for tright in token.rights:
            if tright.ent_type_ == 'ORG' and tright.dep_ == 'conj':
                _add_chunks_containing(tright)

    # Callers test the result against None, so an empty list is not returned.
    if acquired_list:
        return acquired_list
    return None
# 5. find the acquirer | |
def find_acquirer(headline_doc):
    """Collect the noun chunks naming the acquiring organisation(s).

    A token is treated as an acquirer when it is part of an ORG entity and
    its dependency label is 'nsubj' or 'ROOT'. ORG tokens attached to such
    a token as a right-hand 'conj' child ("X and Y") are included too.

    Args:
        headline_doc: Parsed headline. It is iterated for tokens exposing
            ``ent_type_``, ``dep_`` and ``rights``, and must provide
            ``noun_chunks`` (a spaCy ``Doc`` in this file).

    Returns:
        list or None: The matching noun chunks in the order found, without
        duplicates, or None when no acquirer was found.
    """
    # Use the document that was passed in (not a module-level ``doc``), and
    # materialise its noun chunks once instead of once per candidate token.
    noun_chunks = list(headline_doc.noun_chunks)
    acquirer_list = []

    def _add_chunks_containing(tok):
        # Record every noun chunk that contains ``tok``, skipping duplicates.
        for chunk in noun_chunks:
            if tok in chunk and chunk not in acquirer_list:
                acquirer_list.append(chunk)

    for token in headline_doc:
        if not (token.ent_type_ == 'ORG' and token.dep_ in ('nsubj', 'ROOT')):
            continue
        _add_chunks_containing(token)
        # Coordinated subjects hang off the first subject as 'conj' children.
        # Compare with ==: ``in ('conj')`` would be a substring test.
        for tright in token.rights:
            if tright.ent_type_ == 'ORG' and tright.dep_ == 'conj':
                _add_chunks_containing(tright)

    # Callers test the result against None, so an empty list is not returned.
    if acquirer_list:
        return acquirer_list
    return None
# 6. combine the checks to report who is acquiring whom
def detect_acquisition(headline_doc):
    """Print a one-line verdict for a headline.

    The headline is first screened with ``is_company_acquisition``. If it
    passes, ``find_acquirer`` and ``find_acquired`` are each called once.
    When both return a non-empty result the line
    ``<headline> --> <acquirer> acquires <acquired>`` is printed; in every
    other case ``<headline> --> no acquisition detected`` is printed.

    Args:
        headline_doc: Parsed headline, passed through unchanged to the
            helper functions; ``str(headline_doc)`` is used for display.

    Returns:
        None. The result is written to standard output.
    """
    acquirer = None
    acquired = None
    if is_company_acquisition(headline_doc):
        # Run each (relatively expensive) finder once and reuse the result.
        acquirer = find_acquirer(headline_doc)
        acquired = find_acquired(headline_doc)

    # Truthiness treats both None and an empty list as "nothing found", so
    # the check works whichever of the two a finder uses to signal a miss.
    if acquirer and acquired:
        print(str(headline_doc) + " --> " + str(acquirer) + " acquires " + str(acquired))
    else:
        print(str(headline_doc) + " --> " + "no acquisition detected")
# showcase | |
# Run the detector over every preset headline: parse first, then report.
for headline in headlines:
    doc = nlp(headline)
    detect_acquisition(doc)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample headlines: genuine company acquisitions mixed with look-alikes
# (sports trades, property purchases, unrelated uses of "acquire").
headlines = [
    "PayPal to acquire Paidy Inc.",
    "Pump solutions and water technology company Grundfos has entered into an agreement to acquire Mechanical Equipment Company (MECO) for an undisclosed sum.",
    "Cavaliers acquire Lauri Markkanen",
    "Mastercard to acquire CipherTrace",
    "TransUnion to acquire Neustar and Newstar",
    "Microsoft enters into agreement to acquire Myhotelshop to help hotels optimize guest acquisition",
    "CDK to help car buyers acquire insurance",
    "Tyber Medical reaches agreement to acquire CatapultMD",
    "Rebels acquire forward Stevenson from Blades",
    "Google to acquire 1.3M-SF Manhattan office for $2.1B",
    "PayPal decided to acquire Paidy Inc. and Microsoft",
]

# Parse every headline once, then show the (text, label) pair of each
# named entity that was recognised in it.
docs = list(map(nlp, headlines))
for doc in docs:
    print([(entity.text, entity.label_) for entity in doc.ents])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from newscatcherapi import NewsCatcherApiClient | |
# API client; 'YOUR_API_KEY' is a placeholder to be replaced with a real key.
newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR_API_KEY')

# Query for articles matching 'acquire', restricted by the keyword
# arguments below (title search, English, two press-release sources,
# first page of up to 100 results).
acquisition_articles = newscatcherapi.get_search(
    q='acquire',
    search_in='title',
    lang='en',
    from_='24 hours ago',
    sources='prnewswire.com, businesswire.com',
    page_size=100,
    page=1,
)

# Print the title of every returned article.
for article in acquisition_articles['articles']:
    print(article['title'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
import time | |
# Load the 'en_core_web_lg' pipeline once; it is reused for every headline.
nlp = spacy.load("en_core_web_lg")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run the detector over every headline, pausing half a second after each
# one before moving on to the next.
for headline in headlines:
    doc = nlp(headline)
    detect_acquisition(doc)
    time.sleep(0.5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment