# The file "airlinetweets.csv" contains tweets about different airline companies, each labelled with a sentiment (positive, neutral or negative). The task is to train an SVM classifier that assigns each tweet to one of these three classes.

### Importing libraries

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.svm import SVC
import re


### Code to clean the tweets

In [6]:
# The patterns are compiled once instead of being rebuilt for every tweet.
# Raw strings are used so that regex escapes such as \S are not parsed as
# (invalid) Python string escapes, which newer Python versions warn about.
_MENTION_RE = re.compile(r'@\S+')
_LINK_RE = re.compile(r'http\S+\s*')
# Special characters to strip. Emoji are not part of this set, so they survive.
_SPECIAL_CHARS_RE = re.compile(
    '[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_{|}~""")
)


def cleaner(impure_data):
    """Return a list with a cleaned copy of every tweet in *impure_data*.

    Three substitutions are applied to each tweet, in this order:

    1. ``@`` followed by any run of non-whitespace characters is removed.
    2. Anything starting with ``http`` up to the next whitespace is removed,
       together with the whitespace that follows it.
    3. The special characters listed in ``_SPECIAL_CHARS_RE`` are removed.

    Args:
        impure_data: An iterable of strings (a list, a tuple, a pandas
            Series, ...). Every item must be a ``str``.

    Returns:
        A new list of cleaned strings, in the same order as the input.
        An empty input gives an empty list.
    """
    cleaned = []
    for item in impure_data:
        # Mentions go first: '@' itself is also a special character, so
        # stripping punctuation first would leave the user name behind.
        item = _MENTION_RE.sub('', item)
        item = _LINK_RE.sub('', item)
        item = _SPECIAL_CHARS_RE.sub('', item)
        cleaned.append(item)
    return cleaned


### Function to predict tweet sentiment

The predictions for the test tweets are stored in the file "predicted_airlinetweets.csv". The tweets are classified using an SVM. Note that this is a multi-class classification task.

In [21]:
def tweet_sentiment(data=None, *, test_size=0.3, random_state=None,
                    output_path="predicted_airlinetweets.csv"):
    """Train a linear-kernel SVM on TF-IDF features of the tweets.

    The tweets are cleaned with ``cleaner``, split at random into a training
    and a test set, vectorized with TF-IDF and used to fit an ``SVC``. The
    accuracy on the test set is printed, and the test tweets are written to
    a CSV file together with their true and predicted sentiments.

    Args:
        data: Table with a ``"text"`` column (the tweets) and an
            ``"airline_sentiment"`` column (the labels). When omitted, the
            module-level ``df`` is used, as in the original notebook.
            NOTE(review): ``df`` is not defined in the visible part of this
            file; presumably it is loaded from "airlinetweets.csv" in
            another cell - confirm.
        test_size: Passed to ``train_test_split``; the default 0.3 gives a
            70 : 30 train/test split.
        random_state: Passed to ``train_test_split``. Leave as ``None`` for
            a different random split on every call, or pass an integer to
            make the split (and therefore the reported accuracy) repeatable.
        output_path: Where the CSV with the test-set predictions is written.

    Returns:
        A ``(vectorizer, classifier)`` tuple: the fitted ``TfidfVectorizer``
        and the fitted ``SVC``, for classifying further tweets.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook-global table.
        data = df

    tweets = data["text"]
    polarity = data["airline_sentiment"].tolist()

    # Remove @mentions, http(s) links and special characters (punctuation).
    clean_tweet = cleaner(tweets)

    tf_idfvectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)

    # Random train/test split; by default 70 % training, 30 % testing.
    X_train, X_test, Y_train, Y_test = train_test_split(
        clean_tweet, polarity, test_size=test_size, random_state=random_state
    )

    # fit_transform() fits the vectorizer on the training tweets only and
    # converts them into a sparse TF-IDF matrix.
    train_corpus_tf_idf = tf_idfvectorizer.fit_transform(X_train)

    # The test tweets are only transformed, never fitted on, so the
    # vectorizer is built from the training tweets alone.
    test_corpus_tf_idf = tf_idfvectorizer.transform(X_test)

    # SVC with a linear kernel and otherwise default parameters.
    SVM_L = SVC(kernel='linear')
    SVM_L.fit(train_corpus_tf_idf, Y_train)

    Y_pred = SVM_L.predict(test_corpus_tf_idf)

    print("Accuracy", accuracy_score(Y_test, Y_pred))

    # Save the test tweets with their true and predicted sentiments.
    temp_df = pd.DataFrame()
    temp_df["Tweet"] = X_test
    temp_df["Sentiment"] = Y_test
    temp_df["Predicted Sentiment"] = Y_pred
    temp_df.to_csv(output_path)

    return tf_idfvectorizer, SVM_L


### Calling the function: the predicted sentiments are stored in a file and the accuracy is printed

In [22]:
# Train the classifier (this prints the test accuracy and writes the
# predictions CSV) and keep the fitted vectorizer and model so the cells
# below can classify individual sentences.
vectorizer,model = tweet_sentiment()

Accuracy 0.8021402550091075


### Compute the sentiment of a tweet and print the sentiment

In [46]:
# Run the sentence through the same cleaner() that was applied to the
# training tweets, so the vectorizer receives text prepared the same way as
# the text it was fitted on.
vector = vectorizer.transform(cleaner(["My journey was good. Thanks to your customer service."]))
sentiment = model.predict(vector)
print(sentiment)

['positive']

In [47]:
# Run the sentence through the same cleaner() that was applied to the
# training tweets, so the vectorizer receives text prepared the same way as
# the text it was fitted on.
vector = vectorizer.transform(cleaner(["My journey was horrible because of your customer service."]))
sentiment = model.predict(vector)
print(sentiment)

['negative']

In [48]:
# Run the sentence through the same cleaner() that was applied to the
# training tweets, so the vectorizer receives text prepared the same way as
# the text it was fitted on.
vector = vectorizer.transform(cleaner(["My journey was not good because of your crew."]))
sentiment = model.predict(vector)
print(sentiment)

['negative']
`