In [55]:
import spacy
import numpy as np
import pandas as pd

In [56]:
df = pd.read_csv('cleaned_pudding_public_scripts.csv')

In [57]:
nlp = spacy.load('en_core_web_sm')
# Or use the default model, which has fewer features:
# nlp = spacy.load('en')

In [58]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [59]:
doc.ents

(Apple, U.K., $1 billion)

In [60]:
# [prop for prop in dir(doc) if not prop.startswith('_')]

In [61]:
first_word = doc[0]
type(first_word), first_word

(spacy.tokens.token.Token, Apple)

In [62]:
from spacy import displacy
displacy.render(doc, style="dep")


In [63]:
kill_bill_script = df[df.title.str.contains('Kill')].script.values[0]

In [64]:
doc = nlp(kill_bill_script)

In [65]:
# for ent in doc.ents:
#     if ent.label_ == 'GPE':
#         print(ent.text, ent.label_)

In [66]:
# from spacy import displacy
# displacy.render(doc, style="ent")

In [67]:
import warnings
warnings.filterwarnings("ignore")

In [68]:
def get_entities(row, type_entity):
    doc = nlp(row.script)
    entities =[]
    # entities = [ent for ent in doc.ents if ent.label_ == type_entity]
    for ent in doc.ents:
        if ent.label_ == type_entity:
            entities.append(ent.text)
    return entities
    
subset_df = df[df.title.str.contains('Twilight')][0:1]

subset_df['identified_entities'] = subset_df.apply(get_entities, axis=1, type_entity='PERSON')

In [73]:
# print(df[df.title.str.contains('Twilight')][0:1].script.values[0])

In [74]:
character_list_df = pd.read_csv('character_list5.csv', encoding='latin-1')
metadata_df = pd.read_csv('meta_data7.csv', encoding='latin-1')
character_mapping_df = pd.read_csv('character_mapping.csv', encoding='latin-1')

In [77]:
merged_metadata_character_df = character_list_df.merge(metadata_df, on='script_id', how='left')

In [78]:
merged_metadata_character_df.head()

Unnamed: 0,script_id,imdb_character_name,words,gender,age,imdb_id,title,year,gross,lines_data
0,280,betty,311,f,35.0,tt0112579,The Bridges of Madison County,1995,142.0,4332023434343443203433434334433434343434434344...
1,280,carolyn johnson,873,f,,tt0112579,The Bridges of Madison County,1995,142.0,4332023434343443203433434334433434343434434344...
2,280,eleanor,138,f,,tt0112579,The Bridges of Madison County,1995,142.0,4332023434343443203433434334433434343434434344...
3,280,francesca johns,2251,f,46.0,tt0112579,The Bridges of Madison County,1995,142.0,4332023434343443203433434334433434343434434344...
4,280,madge,190,f,46.0,tt0112579,The Bridges of Madison County,1995,142.0,4332023434343443203433434334433434343434434344...


In [79]:
# subset_exploded_df = subset_df.explode('identified_entities')
# subset_exploded_df = subset_exploded_df.drop_duplicates()

# grouped_character_list = merged_filmscripts_metadata_df.groupby(['title'])['imdb_character_name'].apply(list).reset_index(name='character_list')

# grouped_merged_character_list = merged_filmscripts_metadata_df.merge(grouped_character_list, on='title', how='left')

merged_filmscripts_metadata_df = subset_df.merge(merged_metadata_character_df, on=['script_id', 'year', 'title', 'imdb_id'], how='left')


merged_filmscripts_metadata_df.head()

Unnamed: 0,imdb_id,script_id,title,year,gross_ia,link,status_code,script,script_length,identified_entities,imdb_character_name,words,gender,age,gross,lines_data
0,tt1259571,5164,The Twilight Saga: New Moon,2009,344.0,http://www.imsdb.com/scripts/Twilight-New-Moon...,200.0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe In...,244703,"[Drama\nFamily, Fantasy\nFilm-Noir, Melissa Ro...",alice cullen,1200,f,22.0,344.0,1452445303446332253244454134430204464354443254...
1,tt1259571,5164,The Twilight Saga: New Moon,2009,344.0,http://www.imsdb.com/scripts/Twilight-New-Moon...,200.0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe In...,244703,"[Drama\nFamily, Fantasy\nFilm-Noir, Melissa Ro...",angela,138,f,19.0,344.0,1452445303446332253244454134430204464354443254...
2,tt1259571,5164,The Twilight Saga: New Moon,2009,344.0,http://www.imsdb.com/scripts/Twilight-New-Moon...,200.0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe In...,244703,"[Drama\nFamily, Fantasy\nFilm-Noir, Melissa Ro...",aro,442,m,40.0,344.0,1452445303446332253244454134430204464354443254...
3,tt1259571,5164,The Twilight Saga: New Moon,2009,344.0,http://www.imsdb.com/scripts/Twilight-New-Moon...,200.0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe In...,244703,"[Drama\nFamily, Fantasy\nFilm-Noir, Melissa Ro...",bella swan,6542,f,19.0,344.0,1452445303446332253244454134430204464354443254...
4,tt1259571,5164,The Twilight Saga: New Moon,2009,344.0,http://www.imsdb.com/scripts/Twilight-New-Moon...,200.0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe In...,244703,"[Drama\nFamily, Fantasy\nFilm-Noir, Melissa Ro...",charlie swan,878,m,43.0,344.0,1452445303446332253244454134430204464354443254...


In [83]:
identified_characters = merged_filmscripts_metadata_df.imdb_character_name.values.tolist()

In [89]:
patterns = []
for name in identified_characters:
    if len(name.split(' ')) == 1:
        pattern = {"label": "PERSON", "pattern": [{"LOWER": f"{name}"}], "id": f"{name}"}
        patterns.append(pattern)
    elif len(name.split(' ')) > 1:
        pattern = {"label": "PERSON", "pattern": [{"LOWER": f"{name.split(' ')[0]}"}, {"LOWER": f"{name.split(' ')[1]}"}], "id": f"{'-'.join(name.split(' '))}"}
        patterns.append(pattern)

In [91]:
patterns[0:1]

[{'label': 'PERSON',
  'pattern': [{'LOWER': 'alice'}, {'LOWER': 'cullen'}],
  'id': 'alice-cullen'}]

In [92]:
list_names = [{"label": "PERSON", "pattern": [{"LOWER": f"{name}"}], "id": f"{name}"} for name in identified_characters if len(name.split(' ')) == 1]

list_full_names = [{"label": "PERSON", "pattern": [{"LOWER": f"{name.split(' ')[0]}"}, {"LOWER": f"{name.split(' ')[1]}"}], "id": f"{'-'.join(name.split(' '))}"} for name in identified_characters if len(name.split(' ')) > 1]

all_names = list_names + list_full_names
print('example rule', all_names[0])


example rule {'label': 'PERSON', 'pattern': [{'LOWER': 'angela'}], 'id': 'angela'}


In [93]:
full_nlp = spacy.load("en_core_web_sm")
ruler = full_nlp.add_pipe("entity_ruler")
ruler.add_patterns(all_names)


In [94]:
def get_entities(row, type_entity):
    doc = full_nlp(row.script)
    entities =[]
    # entities = [ent for ent in doc.ents if ent.label_ == type_entity]
    for ent in doc.ents:
        if ent.label_ == type_entity:
            entities.append(ent.text)
    return entities
    
subset_df = df[df.title.str.contains('Twilight')][0:1]

subset_df['improved_identified_entities'] = subset_df.apply(get_entities, axis=1, type_entity='PERSON')

In [99]:
improved_characters = subset_df.improved_identified_entities.values[0]

In [98]:
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(subset_df.script.values[0])


In [100]:
rows = []

for sent in sentences:
    for char in improved_characters:
        if char in sent.lower():
            rows.append({'script': sent, 'character': char})

In [101]:
characters_sentences_df = pd.DataFrame(rows)

In [102]:
characters_sentences_df.head()

Unnamed: 0,script,character
0,"JACOB BLACK, 16, jogs up, carrying a used car ...",black ponytail
1,"BELLA\r\n God, Jacob, what are th...",rez
2,"BELLA\r\n God, Jacob, what are th...",rez
3,"He faces forward again, jaw tight, mind workin...",jaw tight
4,"(leans closer, grins) ...",grins


In [None]:
# characters_sentences_df.character = characters_sentences_df.character.str.replace("\r", "")
# characters_sentences_df.character = characters_sentences_df.character.str.replace("\n", "")
# characters_sentences_df.character = characters_sentences_df.character.str.strip()
# characters_sentences_df.drop_duplicates(inplace=True)

In [106]:
import itertools

# https://docs.python.org/3/library/itertools.html

In [107]:
char_rows = []
def get_character_network(rows):
    if len(rows.character) > 1:
        chars = rows.character.tolist()
        chars.sort()
        combos = itertools.combinations(chars, 2)
        for c in combos:
            char_rows.append({'source': c[0], 'target': c[1]})
    # return [{'source': row.character, 'target': char} for char in row.characters_in_script]

characters_sentences_df.groupby('script').apply(get_character_network)

In [108]:
char_df = pd.DataFrame(char_rows)
char_network = char_df.groupby(['source','target']).size().reset_index(name='weight')

In [109]:
char_df.to_csv('char_df_network.csv', index=False)