## Write an SVM classifier to take embeddings, and use them to predict the label.
## Pipeline: load claim/counterclaim library -> embed text -> 5-fold CV a linear
## SVM -> report precision/recall -> calibrate to probabilities -> inspect the
## most confidently classified rows.

# Get the data
import OstreaCultura as OC

dat = CSV.read("data/Climate Misinformation Library with counterclaims.csv", DataFrame)

## Stack claims and counter claims into a single column, label as 1 for claim,
## 0 for counter claim.
dat = stack(select(dat, r"laims"), [:Claims, :Counterclaims], variable_name=:Type, value_name=:text)
dropmissing!(dat)
dat.label = ifelse.(dat.Type .== "Claims", 1, 0)

## Embeddings of text
# NOTE(review): `model` is assigned but never passed to multi_embeddings —
# confirm whether OC.multi_embeddings accepts a model-name argument and, if so,
# forward it; otherwise this constant is dead.
model = "multilingual-e5-large"
embeds = OC.multi_embeddings(dat)

# Features / targets as plain arrays for the sklearn bridge
features = convert(Array, embeds.Embeddings)
y = convert(Array, dat.label)

# Generate the linear SVM (resvm)
@sk_import calibration: CalibratedClassifierCV
@sk_import svm: LinearSVC
import ScikitLearn: CrossValidation   # was imported twice in the original; once suffices
using ScikitLearn.CrossValidation: cross_val_score

resvm = LinearSVC(C=.5, loss="squared_hinge", penalty="l2", multi_class="ovr",
                  random_state=35552, max_iter=2000)

# Use the actual sample count rather than the hard-coded 189 so the script
# keeps working when the library CSV grows or shrinks.
n_samples = length(y)
cv = ScikitLearn.CrossValidation.KFold(n_samples, n_folds=5, random_state=134, shuffle=true)
out = cross_val_score(resvm, features, y, cv=cv)

## get precision and recall from out-of-fold predictions
using ScikitLearn: metrics
y_pred = ScikitLearn.CrossValidation.cross_val_predict(resvm, features, y, cv=cv)

## roll your own precision — guard the zero-denominator case (no positive
## predictions) instead of silently producing a DivideError-free Inf/NaN surprise
n_pred_pos = sum(y_pred .== 1)
pre = n_pred_pos == 0 ? NaN : sum((y .== 1) .& (y_pred .== 1)) / n_pred_pos

## roll your own recall — same guard for the no-true-positives case
n_true_pos = sum(y .== 1)
rec = n_true_pos == 0 ? NaN : sum((y .== 1) .& (y_pred .== 1)) / n_true_pos

## pull out the learned weight vector and look at its distribution.
## (The original had `sv = resvm.coef_` commented out, which would leave `sv`
## undefined for the histogram call below — the assignment must be live.)
fit!(resvm, features, y)
sv = resvm.coef_
histogram(sv[1, :])

## Calibrate the SVM so it emits class probabilities
calsvm = CalibratedClassifierCV(resvm)
calsvm.fit(features, y)
prob_preds = calsvm.predict_proba(features)

## Get indices of top highest probabilities in the first column
# (sklearn orders predict_proba columns by ascending class label, so column 1
# is class 0 = counterclaims — TODO confirm via calsvm.classes_)
top_k = 5
top_indices = sortperm(prob_preds[:, 1], rev=true)[1:top_k]
prob_preds[top_indices, :]
dat[top_indices, :]

## Get the indices of the top k highest probabilities in second column
top_indices = sortperm(prob_preds[:, 2], rev=true)[1:top_k]
prob_preds[top_indices, :]
dat[top_indices, :]