import string

import folium
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pycountry_convert as pc
import pygal
import seaborn as sns
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim
from IPython.display import SVG, display
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
from wordcloud import WordCloud

# Load the trafficking dataset into the working frame and peek at the first rows.
df = pd.read_csv(filepath_or_buffer='cia_trafficking.csv')
df.head(n=3)


def country_to_continent(country_name):
    """Translate a country name into a continent name via pycountry_convert.

    Args:
        country_name: Name handed to ``pc.country_name_to_country_alpha2``.

    Returns:
        The continent name, or ``float("NaN")`` when any step of the lookup
        raises.
    """
    try:
        alpha2 = pc.country_name_to_country_alpha2(country_name)
        continent_code = pc.country_alpha2_to_continent_code(alpha2)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # Best-effort lookup: a failed conversion becomes a missing value.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate while this runs inside `Series.apply`.
        return float("NaN")

# First pass: derive the continent from the country names as they come in the file.
df['continent'] = df['country'].apply(country_to_continent)
df.head(3)


# Rows that still have at least one missing value after the first pass.
df[df.isna().any(axis=1)]


# Rewrite the "Surname, Given"-style names before looking them up again.
country_renames = {
    'Burma': 'Myanmar',
    'Congo, Democratic Republic of the': 'Democratic Republic of the Congo',
    'Congo, Republic of the': 'Republic of the Congo',
    'Gambia, The': 'The Gambia',
    'Korea, North': 'North Korea',
}
for old_name, new_name in country_renames.items():
    df.loc[df['country'] == old_name, 'country'] = new_name

# 'World' is dropped rather than renamed.
df = df[df['country'] != 'World']
df['continent'] = df['country'].apply(country_to_continent)
df[df.isna().any(axis=1)]


# Continents assigned by hand for the names given an explicit override here.
continent_overrides = {
    'The Gambia': 'Africa',
    'Timor-Leste': 'Asia',
}
for overridden_country, continent_name in continent_overrides.items():
    df.loc[df['country'] == overridden_country, 'continent'] = continent_name
df[df.isna().any(axis=1)]


# Module-level Nominatim geocoder client; geolocate() sends its lookups through it.
geolocator = Nominatim(user_agent="myapplication") 
def geolocate(country):
    """Look up the coordinates the module-level geocoder reports for a name.

    Args:
        country: Query passed to ``geolocator.geocode``.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``np.nan`` when the lookup
        raises or yields no result.
    """
    try:
        loc = geolocator.geocode(country)
    except Exception:
        # Best-effort: a failed lookup becomes a missing value. `Exception`
        # (not a bare `except`) keeps Ctrl-C working during slow lookups.
        return np.nan
    if loc is None:
        # No match: handled explicitly instead of relying on the
        # AttributeError that `None.latitude` would raise.
        return np.nan
    return (loc.latitude, loc.longitude)
df['coordinates'] = df['country'].apply(geolocate)
# geolocate() returns a scalar NaN on failure. Pad those entries to a
# (NaN, NaN) pair so every row expands into exactly two columns; a scalar
# mixed in with tuples makes the DataFrame construction below fail.
coordinate_pairs = [
    pair if isinstance(pair, tuple) else (np.nan, np.nan)
    for pair in df['coordinates']
]
df[['latitude', 'longitude']] = pd.DataFrame(coordinate_pairs, index=df.index)
df.head(3)


# Rows with any missing value, then the distinct continent labels.
df[df.isnull().any(axis=1)]
df.continent.unique()

# Output:
# array(['Africa', 'North America', 'Europe', 'South America', 'Asia',
#        'Oceania'], dtype=object)


# Count plot of the dataset rows per continent.
ax = sns.countplot(data=df, x="continent")


def country_to_code(country_name):
    """Translate a country name into the alpha-2 code from pycountry_convert.

    Args:
        country_name: Name handed to ``pc.country_name_to_country_alpha2``.

    Returns:
        The alpha-2 code, or ``float("NaN")`` when the lookup raises.
    """
    try:
        return pc.country_name_to_country_alpha2(country_name)
    except Exception:
        # Best-effort lookup: a failed conversion becomes a missing value.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate while this runs inside `Series.apply`.
        return float("NaN")

# Attach the alpha-2 country code to every row.
country_codes = df['country'].apply(country_to_code)
df['code'] = country_codes
df.head(3)


# Rows that still contain a missing value.
df[df.isnull().any(axis=1)]


# 'The Gambia' gets its code assigned by hand.
is_gambia = df['country'] == 'The Gambia'
df.loc[is_gambia, 'code'] = 'GM'
df[df.isnull().any(axis=1)]


# Drop the "current situation: " label from the report text.
report_text = df['text']
df['text'] = report_text.str.replace('current situation: ', '')
df.head(3)


def preprocess_text(text):
    """Reduce a document to a space-separated string of keyword lemmas.

    Digit characters are removed, the remainder is split into word tokens
    (punctuation is dropped), each token is lowercased and lemmatised as a
    verb, and English stopwords are discarded.

    Args:
        text: Raw document string.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    # Remove digit characters before tokenising.
    digitless = ''.join(ch for ch in text if not ch.isdigit())

    # Tokenise words while ignoring punctuation.
    tokens = RegexpTokenizer(r'\w+').tokenize(digitless)

    # Lowercase and lemmatise.
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Load the stopword list once per call and keep it in a set; calling
    # stopwords.words() inside the filter re-read the list for every lemma.
    stop_words = set(stopwords.words('english'))
    return " ".join(lemma for lemma in lemmas if lemma not in stop_words)

# Replace each report with its preprocessed keyword string.
df['text'] = df['text'].apply(preprocess_text)
df.head(3)


# Fit TF-IDF over all reports; each row of tfIdf corresponds to one row of df.
tfIdfVectorizer = TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(df['text'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# added in 1.0 and fall back to the old name on older releases.
if hasattr(tfIdfVectorizer, 'get_feature_names_out'):
    feature_names = tfIdfVectorizer.get_feature_names_out()
else:
    feature_names = tfIdfVectorizer.get_feature_names()
# Term weights of the first document (row 0) only, highest weight first.
tfIdf_df = pd.DataFrame(tfIdf[0].T.todense(), index=feature_names, columns=["TF-IDF"])
tfIdf_df = tfIdf_df.sort_values('TF-IDF', ascending=False)
tfIdf_df.head()


# Pairwise cosine distance between all documents.
dist = 1 - cosine_similarity(tfIdf)
dist

# Output:
# array([[ 0.00000000e+00,  8.69501828e-01,  8.94652139e-01, ...,
#          8.31762669e-01,  8.42474686e-01,  8.36378215e-01],
#        [ 8.69501828e-01, -6.66133815e-16,  8.98853440e-01, ...,
#          8.72956588e-01,  8.96750877e-01,  8.86675427e-01],
#        [ 8.94652139e-01,  8.98853440e-01, -2.22044605e-16, ...,
#          8.67939405e-01,  8.82415064e-01,  8.99825887e-01],
#        ...,
#        [ 8.31762669e-01,  8.72956588e-01,  8.67939405e-01, ...,
#         -6.66133815e-16,  8.18038027e-01,  7.76779197e-01],
#        [ 8.42474686e-01,  8.96750877e-01,  8.82415064e-01, ...,
#          8.18038027e-01, -2.22044605e-16,  8.28938643e-01],
#        [ 8.36378215e-01,  8.86675427e-01,  8.99825887e-01, ...,
#          7.76779197e-01,  8.28938643e-01,  0.00000000e+00]])


# NOTE(review): tfIdf_df was built from tfIdf[0], so these weights describe the
# first document only - confirm whether a corpus-wide cloud was intended.
term_weights = tfIdf_df.sum(axis=1)
cloud_builder = WordCloud(background_color="white", max_words=50)
wordcloud = cloud_builder.generate_from_frequencies(term_weights)

# Render the cloud on a 10x8 inch figure with the axes hidden.
plt.figure(figsize=(10, 8), dpi=100)
plt.axis("off")
plt.imshow(wordcloud, interpolation='bilinear')

# Output: <matplotlib.image.AxesImage at 0x2129ee9f880>


def polarity(x):
    """Return element 0 of the TextBlob sentiment of ``x`` (stored as 'polarity')."""
    sentiment = TextBlob(x).sentiment
    return sentiment[0]

def subjectivity(x):
    """Return element 1 of the TextBlob sentiment of ``x`` (stored as 'subjectivity')."""
    sentiment = TextBlob(x).sentiment
    return sentiment[1]

# Score every preprocessed report and store one column per sentiment measure.
for score_column, scorer in (('polarity', polarity), ('subjectivity', subjectivity)):
    df[score_column] = df['text'].apply(scorer)

df.head(3)


# One bar plot per measure, grouped by continent.
for score_column in ('polarity', 'subjectivity'):
    sns.barplot(x="continent", y=score_column, data=df)
    plt.show()


def display_svg(svg_code):
    """Wrap rendered SVG data in an IPython ``SVG`` object and return it."""
    svg_object = SVG(svg_code)
    return svg_object

# Radius used for every circle marker on the folium maps.
MARKER_RADIUS = 5
# Popup markup: country name, then the score label and its value.
POPUP_TEMPLATE = "Country : {}<br>\n                    {} : {}<br>"


def _scores_by_country_code(column):
    """Return a dict from lowercase country code to the value in df[column]."""
    return dict(zip(df.code.str.lower(), df[column]))


def _build_pygal_world_map(title, series_label, values_by_code):
    """Create a pygal world map with a title and a single data series."""
    worldmap = pygal.maps.world.World()
    worldmap.title = title
    worldmap.add(series_label, values_by_code)
    return worldmap


def _build_marker_map(column, label):
    """Create a folium map with one clustered circle marker per row of df.

    Returns the map together with its marker cluster. Rows whose latitude or
    longitude is missing are skipped, since a marker cannot be placed for them.
    """
    marker_map = folium.Map(tiles="cartodbpositron")
    cluster = MarkerCluster().add_to(marker_map)
    rows = zip(df['country'], df['latitude'], df['longitude'], df[column])
    for country, lat, lon, value in rows:
        if pd.isna(lat) or pd.isna(lon):
            continue
        folium.CircleMarker(
            location=[lat, lon],
            radius=MARKER_RADIUS,
            popup=POPUP_TEMPLATE.format(country, label, value),
            fill=True,
        ).add_to(cluster)
    return marker_map, cluster


# Polarity: pygal world map followed by the folium marker map.
polarity_dict = _scores_by_country_code('polarity')
worldmap1 = _build_pygal_world_map('CIA Human Trafficking Polarity', 'Polarity', polarity_dict)
display_svg(worldmap1.render())


world_map, marker_cluster = _build_marker_map('polarity', 'Polarity')
world_map


# Subjectivity: same two maps; world_map and marker_cluster are rebound here.
subjectivity_dict = _scores_by_country_code('subjectivity')
worldmap2 = _build_pygal_world_map('CIA Human Trafficking Subjectivity', 'Subjectivity', subjectivity_dict)
display_svg(worldmap2.render())


world_map, marker_cluster = _build_marker_map('subjectivity', 'Subjectivity')
world_map

	country	text
0	Algeria	current situation: Algeria is a transit and, t...
1	Antigua and Barbuda	current situation: Antigua and Barbuda is a de...
2	Belarus	current situation: Belarus is a source, transi...

	country	text	continent	coordinates	latitude	longitude
0	Algeria	current situation: Algeria is a transit and, t...	Africa	(28.0000272, 2.9999825)	28.000027	2.999983
1	Antigua and Barbuda	current situation: Antigua and Barbuda is a de...	North America	(17.2234721, -61.9554608)	17.223472	-61.955461
2	Belarus	current situation: Belarus is a source, transi...	Europe	(53.4250605, 27.6971358)	53.425061	27.697136

	country	text	continent	coordinates	latitude	longitude	code
0	Algeria	current situation: Algeria is a transit and, t...	Africa	(28.0000272, 2.9999825)	28.000027	2.999983	DZ
1	Antigua and Barbuda	current situation: Antigua and Barbuda is a de...	North America	(17.2234721, -61.9554608)	17.223472	-61.955461	AG
2	Belarus	current situation: Belarus is a source, transi...	Europe	(53.4250605, 27.6971358)	53.425061	27.697136	BY

	country	text	continent	coordinates	latitude	longitude	code
0	Algeria	Algeria is a transit and, to a lesser extent, ...	Africa	(28.0000272, 2.9999825)	28.000027	2.999983	DZ
1	Antigua and Barbuda	Antigua and Barbuda is a destination and trans...	North America	(17.2234721, -61.9554608)	17.223472	-61.955461	AG
2	Belarus	Belarus is a source, transit, and destination ...	Europe	(53.4250605, 27.6971358)	53.425061	27.697136	BY

	country	text	continent	coordinates	latitude	longitude	code
0	Algeria	algeria transit lesser extent destination sour...	Africa	(28.0000272, 2.9999825)	28.000027	2.999983	DZ
1	Antigua and Barbuda	antigua barbuda destination transit country ad...	North America	(17.2234721, -61.9554608)	17.223472	-61.955461	AG
2	Belarus	belarus source transit destination country wom...	Europe	(53.4250605, 27.6971358)	53.425061	27.697136	BY

Importing Dataset & Libraries

Data Pre-processing

Natural Language Processing

Visualization

	country	text	continent
8	Burma	current situation: Burma is a source country f...	NaN
14	Congo, Democratic Republic of the	current situation: The Democratic Republic of ...	NaN
15	Congo, Republic of the	current situation: the Republic of the Congo i...	NaN
23	Gambia, The	current situation: The Gambia is a source and ...	NaN
31	Korea, North	current situation: North Korea is a source cou...	NaN
58	Timor-Leste	current situation: Timor-Leste is a source and...	NaN
65	World	current situation: the International Labour Or...	NaN

	country	text	continent
23	The Gambia	current situation: The Gambia is a source and ...	NaN
58	Timor-Leste	current situation: Timor-Leste is a source and...	NaN

	TF-IDF
algeria	0.451134
sub	0.314730
saharan	0.314730
traffic	0.218949
arrest	0.159665