Plagiarism Detection Using Transformers
Source: 磐创AI
Motivation
Problem Statement
Analysis Approach
Scientific Implementation
Data Preprocessing
import pandas as pd

def preprocess_data(data_path, sample_size):
    # Read the data from the specified path
    data = pd.read_csv(data_path, low_memory=False)
    # Drop articles without an abstract
    data = data.dropna(subset=['abstract']).reset_index(drop=True)
    # Get "sample_size" random articles; keep paper_id as well,
    # since the plagiarism analyzer reports it later
    data = data.sample(sample_size)[['abstract', 'paper_id']]
    return data

# Read the data and preprocess it
data_path = "./data/cord19_source_data.csv"
source_data = preprocess_data(data_path, 100)
Document Vectorizer
The vectorizer is built around two main functions:
- create_vector_from_text: generates the vector representation of a single document.
- create_vector_database: builds a database that stores the corresponding vector of each document.
# Useful libraries
import numpy as np
import torch
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, AutoModelForSequenceClassification
from transformers import MarianMTModel, MarianTokenizer

# Load the BERT model
model_path = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_path,
                                          do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                           output_attentions=False,
                                                           output_hidden_states=True)
def create_vector_from_text(tokenizer, model, text, MAX_LEN=510):

    input_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
    )

    # Pad the token ids to a fixed length.
    results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long",
                            truncating="post", padding="post")

    # Remove the outer list.
    input_ids = results[0]

    # Create attention masks (1 for real tokens, 0 for padding).
    attention_mask = [int(i > 0) for i in input_ids]

    # Convert to tensors.
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)

    # Add an extra dimension for the "batch" (even though there is only one
    # input in this batch).
    input_ids = input_ids.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)

    # Put the model in "evaluation" mode, meaning feed-forward operation only.
    model.eval()

    # Run the text through BERT and collect the hidden states produced
    # by all 12 layers.
    with torch.no_grad():
        logits, encoded_layers = model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=attention_mask,
            return_dict=False)

    layer_i = 12  # The last BERT layer before the classifier.
    batch_i = 0   # Only one input in the batch.
    token_i = 0   # The first token, corresponding to [CLS].

    # Extract the vector.
    vector = encoded_layers[layer_i][batch_i][token_i]

    # Move to the CPU and convert to a numpy ndarray.
    vector = vector.detach().cpu().numpy()

    return vector
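As a quick sanity check (a minimal sketch assuming the model loaded above; the sample text is illustrative, not from the dataset), the function returns the 768-dimensional [CLS] embedding of a document:

# Minimal sanity check (sample text is an assumption, not from the dataset).
sample_vector = create_vector_from_text(tokenizer, model, "A short test abstract about virology.")
print(sample_vector.shape)  # (768,) for bert-base-uncased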
def create_vector_database(data):

    # The list of all the vectors
    vectors = []

    # Get the overall text data
    source_data = data.abstract.values

    # Loop over all the abstracts and get their embeddings
    for text in tqdm(source_data):

        # Get the embedding
        vector = create_vector_from_text(tokenizer, model, text)

        # Add it to the list
        vectors.append(vector)

    data["vectors"] = vectors
    data["vectors"] = data["vectors"].apply(lambda emb: np.array(emb))
    data["vectors"] = data["vectors"].apply(lambda emb: emb.reshape(1, -1))

    return data
# Create the vector database
vector_database = create_vector_database(source_data)
vector_database.sample(5)
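Since the analyzer below relies on cosine similarity, a quick way to sanity-check the database (a minimal sketch, not from the original article) is to compare two stored vectors directly:

# Compare the vectors of the first two stored abstracts (illustrative check).
vec_a = vector_database.iloc[0]["vectors"]    # shape (1, 768) after the reshape above
vec_b = vector_database.iloc[1]["vectors"]
print(cosine_similarity(vec_a, vec_b)[0][0])  # in [-1, 1]; higher means more similar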
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

def translate_text(text, text_lang, target_lang='en'):

    # Get the name of the model
    model_name = f"Helsinki-NLP/opus-mt-{text_lang}-{target_lang}"

    # Get the tokenizer
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    # Instantiate the model
    model = MarianMTModel.from_pretrained(model_name)

    # Translate the text
    formated_text = ">>{}<< {}".format(text_lang, text)
    translation = model.generate(**tokenizer([formated_text], return_tensors="pt", padding=True))

    translated_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translation][0]

    return translated_text
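A minimal usage sketch (the sample sentence is illustrative, not from the dataset): detect the language with langdetect, then translate into English:

# Illustrative usage; the sample sentence and expected language are assumptions.
sample_text = "Les modèles de langue facilitent la détection du plagiat."
sample_lang = detect(sample_text)                # expected: 'fr'
print(translate_text(sample_text, sample_lang))  # prints the English translation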
Plagiarism Analyzer
The analyzer returns a dictionary with the following information:
- similarity_score: the score between the incoming article and the most similar existing article in the database.
- is_plagiarism: true if the similarity score equals or exceeds the threshold, false otherwise.
- most_similar_article: the text of the most similar article.
- article_submitted: the article submitted for review.
def process_document(text):
    """
    Create a vector for the given text and adjust it for cosine similarity search.
    """
    text_vect = create_vector_from_text(tokenizer, model, text)
    text_vect = np.array(text_vect)
    text_vect = text_vect.reshape(1, -1)

    return text_vect
def is_plagiarism(similarity_score, plagiarism_threshold):
    # Flag plagiarism when the similarity score reaches or exceeds the threshold.
    return similarity_score >= plagiarism_threshold
def check_incoming_document(incoming_document):

    text_lang = detect(incoming_document)
    language_list = ['de', 'fr', 'el', 'ja', 'ru']

    if text_lang == 'en':
        final_result = incoming_document
    elif text_lang not in language_list:
        final_result = None
    else:
        # Translate into English
        final_result = translate_text(incoming_document, text_lang)

    return final_result
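For example (both inputs are illustrative assumptions, not from the dataset), an English document passes through unchanged, while a text in an unsupported language such as Italian yields None:

# Illustrative inputs (assumptions, not from the dataset).
print(check_incoming_document("This abstract describes a study of viral transmission."))  # expected: returned as-is
print(check_incoming_document("Questo documento è scritto in italiano."))                 # expected: None (unsupported)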
def run_plagiarism_analysis(query_text, data, plagiarism_threshold=0.8):

    top_N = 3

    # Check the language of the query/incoming text and translate it if required.
    document_translation = check_incoming_document(query_text)

    if document_translation is None:
        print("Only the following languages are supported: English, French, Russian, German, Greek and Japanese")
        exit(-1)
    else:
        # Preprocess the document to get the vector required for the similarity analysis
        query_vect = process_document(document_translation)

        # Run the similarity search
        data["similarity"] = data["vectors"].apply(lambda x: cosine_similarity(query_vect, x))
        data["similarity"] = data["similarity"].apply(lambda x: x[0][0])

        # Sort by similarity and keep rows 1..top_N; the slice starts at 1,
        # dropping the single best match, which is presumably the query itself
        # when the query already exists in the database.
        similar_articles = data.sort_values(by='similarity', ascending=False)[1:top_N+1]
        formated_result = similar_articles[["abstract", "paper_id", "similarity"]].reset_index(drop=True)
        similarity_score = formated_result.iloc[0]["similarity"]
        most_similar_article = formated_result.iloc[0]["abstract"]
        is_plagiarism_bool = is_plagiarism(similarity_score, plagiarism_threshold)

        plagiarism_decision = {'similarity_score': similarity_score,
                               'is_plagiarism': is_plagiarism_bool,
                               'most_similar_article': most_similar_article,
                               'article_submitted': query_text
                               }

        return plagiarism_decision
System Experimentation
Evaluation
english_article_to_check = "The need for multidisciplinary research to address today's complex health and environmental challenges has never been greater. The One Health (OH) approach to research ensures that human, animal, and environmental health questions are evaluated in an integrated and holistic manner to provide a more comprehensive understanding of the problem and potential solutions than would be possible with siloed approaches. However, the OH approach is complex, and there is limited guidance available for investigators regarding the practical design and implementation of OH research. In this paper we provide a framework to guide researchers through conceptualizing and planning an OH study. We discuss key steps in designing an OH study, including conceptualization of hypotheses and study aims, identification of collaborators for a multi-disciplinary research team, study design options, data sources and collection methods, and analytical methods. We illustrate these concepts through the presentation of a case study of health impacts associated with land application of biosolids. Finally, we discuss opportunities for applying an OH approach to identify solutions to current global health issues, and the need for cross-disciplinary funding sources to foster an OH approach to research."
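The check for this English article follows the same pattern as the French and German examples below (the exact invocation is assumed here):

analysis_result = run_plagiarism_analysis(english_article_to_check, vector_database, plagiarism_threshold=0.8)
analysis_result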
# Select an existing article from the database
new_incoming_text = source_data.iloc[0]['abstract']
# Run the plagiarism detection
analysis_result = run_plagiarism_analysis(new_incoming_text, vector_database, plagiarism_threshold=0.8)
french_article_to_check = """Les Réseaux d’Innovation et de Transfert Agricole (RITA) ont été créés en 2011 pour mieux connecter la recherche et le développement agricole, intra et inter-DOM, avec un objectif d’accompagnement de la diversification des productions locales. Le CGAAER a été chargé d'analyser ce dispositif et de proposer des pistes d'action pour améliorer la chaine Recherche – Formation – Innovation – Développement – Transfert dans les outre-mer dans un contexte d'agriculture durable, au profit de l'accroissement de l'autonomie alimentaire."""
analysis_result = run_plagiarism_analysis(french_article_to_check, vector_database, plagiarism_threshold=0.8)
analysis_result
german_article_to_check = """Derzeit ist eine Reihe strukturell und funktionell unterschiedlicher temperaturempfindlicher Elemente wie RNA-Thermometer bekannt, die eine Vielzahl biologischer Prozesse in Bakterien, einschließlich der Virulenz, steuern. Auf der Grundlage einer Computer- und thermodynamischen Analyse der vollständig sequenzierten Genome von 25 Salmonella enterica-Isolaten wurden ein Algorithmus und Kriterien für die Suche nach potenziellen RNA-Thermometern entwickelt. Er wird es ermöglichen, die Suche nach potentiellen Riboschaltern im Genom anderer gesellschaftlich wichtiger Krankheitserreger durchzuführen. Für S. enterica wurden neben dem bekannten 4U-RNA-Thermometer vier Hairpin-Loop-Strukturen identifiziert, die wahrscheinlich als weitere RNA-Thermometer fungieren. Sie erfüllen die notwendigen und hinreichenden Bedingungen für die Bildung von RNA-Thermometern und sind hochkonservative nichtkanonische Strukturen, da diese hochkonservativen Strukturen im Genom aller 25 Isolate von S. enterica gefunden wurden. Die Hairpins, die eine kreuzförmige Struktur in der supergewickelten pUC8-DNA bilden, wurden mit Hilfe der Rasterkraftmikroskopie sichtbar gemacht."""
analysis_result = run_plagiarism_analysis(german_article_to_check, vector_database, plagiarism_threshold=0.8)
analysis_result
Conclusion