Let's import each library at the point where it is actually needed, rather than importing everything at the top of the notebook.
import pandas as pd
Load our data frames
# Read the three competition CSVs (train, test, sample submission) in one pass;
# the paths are consumed in order, so the reads happen in the same sequence.
train_df, test_df, sample_df = map(
    pd.read_csv,
    (
        "../input/contradictory-my-dear-watson/train.csv",
        "../input/contradictory-my-dear-watson/test.csv",
        '../input/contradictory-my-dear-watson/sample_submission.csv',
    ),
)
Initial look at our data
# Display the training frame (rendered by the notebook as a table).
train_df
id | premise | hypothesis | lang_abv | language | label | |
---|---|---|---|---|---|---|
0 | 5130fd2cb5 | and these comments were considered in formulat... | The rules developed in the interim were put to... | en | English | 0 |
1 | 5b72532a0b | These are issues that we wrestle with in pract... | Practice groups are not permitted to work on t... | en | English | 2 |
2 | 3931fbe82a | Des petites choses comme celles-là font une di... | J'essayais d'accomplir quelque chose. | fr | French | 0 |
3 | 5622f0c60b | you know they can't really defend themselves l... | They can't defend themselves because of their ... | en | English | 0 |
4 | 86aaa48b45 | ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด... | เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร | th | Thai | 1 |
... | ... | ... | ... | ... | ... | ... |
12115 | 2b78e2a914 | The results of even the most well designed epi... | All studies have the same amount of uncertaint... | en | English | 2 |
12116 | 7e9943d152 | But there are two kinds of the pleasure of do... | But there are two kinds of the pleasure of doi... | en | English | 0 |
12117 | 5085923e6c | The important thing is to realize that it's wa... | It cannot be moved, now or ever. | en | English | 2 |
12118 | fc8e2fd1fe | At the west end is a detailed model of the who... | The model temple complex is at the east end. | en | English | 2 |
12119 | 44301dfb14 | For himself he chose Atat??rk, or Father of th... | Ataturk was the father of the Turkish nation. | en | English | 0 |
12120 rows × 6 columns
# Count missing values per column of the training frame.
train_df.isna().sum()
id 0
premise 0
hypothesis 0
lang_abv 0
language 0
label 0
dtype: int64
# Number of training rows per language.
train_df['language'].value_counts()
English 6870
Chinese 411
Arabic 401
French 390
Swahili 385
Urdu 381
Vietnamese 379
Russian 376
Hindi 374
Greek 372
Thai 371
Spanish 366
Turkish 351
German 351
Bulgarian 342
Name: language, dtype: int64
# Number of training rows per label value, to check class balance.
train_df['label'].value_counts()
0 4176
2 4064
1 3880
Name: label, dtype: int64
# Display the test frame (rendered by the notebook as a table).
test_df
id | premise | hypothesis | lang_abv | language | |
---|---|---|---|---|---|
0 | c6d58c3f69 | بکس، کیسی، راہیل، یسعیاہ، کیلی، کیلی، اور کولم... | کیسی کے لئے کوئی یادگار نہیں ہوگا, کولمین ہائی... | ur | Urdu |
1 | cefcc82292 | هذا هو ما تم نصحنا به. | عندما يتم إخبارهم بما يجب عليهم فعله ، فشلت ال... | ar | Arabic |
2 | e98005252c | et cela est en grande partie dû au fait que le... | Les mères se droguent. | fr | French |
3 | 58518c10ba | 与城市及其他公民及社区组织代表就IMA的艺术发展进行对话& | IMA与其他组织合作,因为它们都依靠共享资金。 | zh | Chinese |
4 | c32b0d16df | Она все еще была там. | Мы думали, что она ушла, однако, она осталась. | ru | Russian |
... | ... | ... | ... | ... | ... |
5190 | 5f90dd59b0 | نیند نے وعدہ کیا کہ موٹل نے سوال میں تحقیق کی. | نیمیتھ کو موٹل کی تفتیش کے لئے معاوضہ دیا جارہ... | ur | Urdu |
5191 | f357a04e86 | The rock has a soft texture and can be bough... | The rock is harder than most types of rock. | en | English |
5192 | 1f0ea92118 | 她目前的存在,并考虑到他与沃佛斯顿争执的本质,那是尴尬的。 | 她在与Wolverstone的打斗结束后才在场的事实被看作是很尴尬的。 | zh | Chinese |
5193 | 0407b48afb | isn't it i can remember i've only been here ei... | I could see downtown Dallas from where I lived... | en | English |
5194 | 16c2f2ab89 | In Hong Kong you can have a plate, or even a w... | It's impossible to have a plate hand-painted t... | en | English |
5195 rows × 5 columns
# Number of test rows per language.
test_df['language'].value_counts()
English 2945
Spanish 175
Swahili 172
Russian 172
Urdu 168
Greek 168
Turkish 167
Thai 164
Arabic 159
French 157
German 152
Chinese 151
Hindi 150
Bulgarian 150
Vietnamese 145
Name: language, dtype: int64
# Display the sample submission frame (rendered by the notebook as a table).
sample_df
id | prediction | |
---|---|---|
0 | c6d58c3f69 | 1 |
1 | cefcc82292 | 1 |
2 | e98005252c | 1 |
3 | 58518c10ba | 1 |
4 | c32b0d16df | 1 |
... | ... | ... |
5190 | 5f90dd59b0 | 1 |
5191 | f357a04e86 | 1 |
5192 | 1f0ea92118 | 1 |
5193 | 0407b48afb | 1 |
5194 | 16c2f2ab89 | 1 |
5195 rows × 2 columns
Modeling
import tensorflow as tf
# Pick the distribution strategy: use the TPU when one can be resolved,
# otherwise fall back to TensorFlow's current default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # A ValueError from the TPU setup above is treated as "no TPU available".
    strategy = tf.distribute.get_strategy()
from transformers import TFAutoModel, AutoTokenizer
wandb: WARNING W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
def model_watson(strategy, transformer, max_len=100, num_classes=3,
                 learning_rate=1e-5):
    """Build and compile a classifier on top of a pretrained transformer.

    Args:
        strategy: Distribution strategy; the model is created and compiled
            inside ``strategy.scope()``.
        transformer: Model name or path handed to
            ``TFAutoModel.from_pretrained``.
        max_len: Fixed length of the token-id input. It must match the length
            the inputs are padded to. Defaults to 100.
        num_classes: Number of units in the softmax output layer.
            Defaults to 3.
        learning_rate: Learning rate for the Adam optimizer. Defaults to 1e-5.

    Returns:
        The compiled Keras ``Model``, taking int32 token ids of shape
        ``(batch, max_len)`` and producing ``num_classes`` softmax scores.
    """
    with strategy.scope():
        transformer_encoder = TFAutoModel.from_pretrained(transformer)
        input_layer = Input(shape=(max_len,), dtype=tf.int32, name="input_layer")
        # Element 0 of the encoder output is used as the per-token sequence
        # output; only position 0 of each sequence feeds the classifier head.
        # NOTE(review): only token ids are passed in (no attention mask), so
        # padding positions are presumably attended to -- confirm intended.
        sequence_output = transformer_encoder(input_layer)[0]
        cls_token = sequence_output[:, 0, :]
        output_layer = Dense(num_classes, activation='softmax')(cls_token)
        model = Model(inputs=input_layer, outputs=output_layer)
        # `learning_rate` replaces the deprecated `lr` keyword.
        model.compile(
            Adam(learning_rate=learning_rate),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'],
        )
    return model
# Build the classifier on the multilingual DistilBERT checkpoint; the same
# checkpoint name must be used for the tokenizer so the token ids match.
model = model_watson(strategy,"distilbert-base-multilingual-cased")
HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466.0, style=ProgressStyle(description_…
HBox(children=(FloatProgress(value=0.0, description='Downloading', max=910749124.0, style=ProgressStyle(descri…
# Tokenizer for the same checkpoint the model was built from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

# One [premise, hypothesis] pair per row, as plain Python lists.
train_data = train_df.loc[:, ['premise', 'hypothesis']].values.tolist()
test_data = test_df.loc[:, ['premise', 'hypothesis']].values.tolist()

# Encode each pair, requesting a fixed length of 100 with padding.
train_encoded = tokenizer.batch_encode_plus(
    train_data,
    max_length=100,
    pad_to_max_length=True,
)
test_encoded = tokenizer.batch_encode_plus(
    test_data,
    max_length=100,
    pad_to_max_length=True,
)
from sklearn.model_selection import train_test_split
# Hold out 20% of the encoded training pairs for validation. The split is
# seeded so that validation metrics are comparable from one run to the next.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_encoded['input_ids'],
    train_df.label.values,
    test_size=0.2,
    random_state=42,
)
# Only the token ids are used as model input for the test set.
x_test = test_encoded['input_ids']
# Global batch size: 20 examples for each replica of the strategy.
global_batch_size = 20 * strategy.num_replicas_in_sync

# Training pipeline. Shuffle BEFORE repeat so that every pass over the data
# is shuffled on its own; repeating first lets the shuffle buffer mix
# examples from consecutive epochs. The dataset is infinite, so `fit` must be
# given `steps_per_epoch`.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(2048)
    .repeat()
    .batch(global_batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Validation pipeline: fixed order, cached after the first pass.
valid_dataset = (
    tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
    .batch(global_batch_size)
    .cache()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Test pipeline: inputs only, kept in order so predictions line up with rows.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(global_batch_size)
model.summary()

# One epoch = one pass over the training split. The batch size is
# 20 * num_replicas, so the step count is the number of training examples
# divided by that product. The parentheses matter: `n // 20 * replicas`
# multiplies the step count by the replica count instead of dividing.
steps_per_epoch = max(1, len(x_train) // (20 * strategy.num_replicas_in_sync))

history = model.fit(
    train_dataset,
    steps_per_epoch=steps_per_epoch,
    validation_data=valid_dataset,
    epochs=5,
)
Our prediction output
import os

# Per-class scores for every test row; the arg-max along axis 1 is the
# predicted label stored in the submission frame.
predictions = model.predict(test_dataset, verbose=1)
predicted_labels = predictions.argmax(axis=1)
sample_df['prediction'] = predicted_labels

# Switch to the working directory and write the submission file there.
os.chdir(r'/kaggle/working')
sample_df.to_csv(r'submission.csv', index=False)

# Show the first ten rows of the submission.
sample_df.head(10)