使用Transformer进行抄袭检测
新机器视觉
共 17516字,需浏览 36分钟
· 2023-09-21
点击下方卡片,关注“新机器视觉”公众号
视觉/图像重磅干货,第一时间送达
来源:磐创AI
动机
问题陈述
分析方法
科学实施
数据预处理
import pandas as pd
def preprocess_data(data_path, sample_size, random_state=None):
    """Load the article CSV and return a random sample of usable abstracts.

    Args:
        data_path: Path of the CSV file to read. It must contain an
            ``abstract`` column.
        sample_size: Number of articles to draw. The value is capped at the
            number of articles that actually have an abstract, so a small
            file no longer makes ``DataFrame.sample`` raise.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` keeps the unseeded draw.

    Returns:
        A DataFrame holding the ``abstract`` column, plus ``paper_id`` when
        the CSV provides one (the plagiarism report selects that column).
        Rows keep the index labels they had after the missing abstracts were
        dropped and the index was reset.
    """
    # Read the data from specific path
    data = pd.read_csv(data_path, low_memory=False)

    # Drop articles without Abstract
    data = data.dropna(subset=["abstract"]).reset_index(drop=True)

    # Keep the article identifier next to its text whenever it is available.
    columns = ["abstract"]
    if "paper_id" in data.columns:
        columns.append("paper_id")

    # Get "sample_size" random articles, never asking for more than exist.
    n_rows = min(sample_size, len(data))
    return data.sample(n_rows, random_state=random_state)[columns]
# Location of the source CSV, then a 100-article sample drawn from it.
data_path = "./data/cord19_source_data.csv"
source_data = preprocess_data(data_path=data_path, sample_size=100)
文档向量化器
- create_vector_from_text:用于生成单个文档的向量表示。
- create_vector_database:负责创建一个数据库,其中包含每个文档的相应向量。
# Useful libraries
import numpy as np
import torch
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, AutoModelForSequenceClassification
# One checkpoint name is shared by the tokenizer and the model.
model_path = "bert-base-uncased"

# Lower-casing tokenizer for the checkpoint above.
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)

# Hidden states are requested because the embedding code reads them;
# attention weights are not needed.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    output_hidden_states=True,
    output_attentions=False,
)
def create_vector_from_text(tokenizer, model, text, MAX_LEN=510):
    """Embed one document as the first-token vector of hidden layer 12.

    Args:
        tokenizer: Object whose ``encode`` turns ``text`` into token ids.
        model: Model that, called with ``return_dict=False``, returns a
            ``(logits, hidden_states)`` pair.
        text: The document to embed.
        MAX_LEN: Length the id sequence is padded or truncated to.

    Returns:
        A numpy array holding the vector of the first token (the [CLS]
        position) taken from hidden-state index 12 of the single input.
    """
    # Tokenize with special tokens, bounded by MAX_LEN.
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
    )

    # Pad/truncate at the end to exactly MAX_LEN, then unwrap the 1-item batch.
    padded_ids = pad_sequences(
        [token_ids],
        maxlen=MAX_LEN,
        dtype="long",
        truncating="post",
        padding="post",
    )[0]

    # Positions with an id above 0 are attended to; the rest are masked out.
    mask = [int(token_id > 0) for token_id in padded_ids]

    # Build tensors and prepend the batch dimension (a batch of one).
    input_ids = torch.tensor(padded_ids).unsqueeze(0)
    attention_mask = torch.tensor(mask).unsqueeze(0)

    # Evaluation mode: plain feed-forward pass.
    model.eval()

    # No gradients are needed to read the hidden states.
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=attention_mask,
            return_dict=False,
        )
    _, hidden_states = outputs

    last_layer = 12   # The last BERT layer before the classifier.
    only_item = 0     # Only one input in the batch.
    cls_position = 0  # The first token, corresponding to [CLS]

    cls_vector = hidden_states[last_layer][only_item][cls_position]

    # Move to the CPU and hand back a numpy ndarray.
    return cls_vector.detach().cpu().numpy()
def create_vector_database(data):
    """Attach an embedding to every abstract in ``data``.

    Each abstract is embedded with ``create_vector_from_text`` using the
    module-level ``tokenizer`` and ``model``.

    Args:
        data: DataFrame with an ``abstract`` column. It is modified in place:
            a ``vectors`` column is added (or overwritten).

    Returns:
        The same DataFrame, where every ``vectors`` cell is a numpy array
        reshaped to a single row, i.e. shape ``(1, n)``.
    """
    # tqdm only draws the progress bar and is not imported at file level, so
    # import it here and fall back to plain iteration when it is missing.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    # Get overall text data
    abstracts = data.abstract.values

    # Embed every abstract, in row order.
    vectors = [
        create_vector_from_text(tokenizer, model, text)
        for text in progress(abstracts)
    ]

    data["vectors"] = vectors
    # Store each embedding as a one-row 2-D array.
    data["vectors"] = data["vectors"].apply(
        lambda emb: np.array(emb).reshape(1, -1)
    )
    return data
# Embed the sampled abstracts, then peek at five random rows of the result.
vector_database = create_vector_database(data=source_data)
vector_database.sample(5)
from langdetect import detect, DetectorFactory
# Pin langdetect's seed. The langdetect README describes detection as
# non-deterministic unless DetectorFactory.seed is set, so this keeps
# detect() repeatable between runs.
DetectorFactory.seed = 0
def translate_text(text, text_lang, target_lang='en'):
    """Translate ``text`` with a Helsinki-NLP opus-mt Marian model.

    Args:
        text: The text to translate.
        text_lang: Language code of ``text``; used to pick the model.
        target_lang: Language code to translate into (default ``'en'``).

    Returns:
        The translated text as a single string.

    The tokenizer and model are loaded by name on every call.
    """
    # The Marian classes are not imported at file level, so bring them into
    # scope here; otherwise the first call fails with a NameError.
    from transformers import MarianMTModel, MarianTokenizer

    # Get the name of the model
    model_name = f"Helsinki-NLP/opus-mt-{text_lang}-{target_lang}"

    # Get the tokenizer
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    # Instantiate the model
    model = MarianMTModel.from_pretrained(model_name)

    # NOTE(review): the ">>xx<<" prefix is built from the *source* language.
    # Marian documents that prefix as a target-language marker for
    # multilingual models - confirm whether it is wanted here at all.
    formated_text = ">>{}<< {}".format(text_lang, text)

    # Truncate so that a very long document cannot exceed the model's
    # maximum input length.
    batch = tokenizer(
        [formated_text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Translation of the text; only one sequence was submitted.
    translation = model.generate(**batch)
    return tokenizer.decode(translation[0], skip_special_tokens=True)
抄袭分析器
- similarity_score:传入文章与数据库中最相似的现有文章之间的得分。
- is_plagiarism:如果相似度得分等于或超过阈值,则值为true。否则为false。
- most_similar_article:最相似文章的文本信息。
- article_submitted:提交审批的文章。
def process_document(text):
    """
    Embed the given text and shape it as one row for cosine similarity search
    """
    # Uses the module-level tokenizer and model.
    embedding = create_vector_from_text(tokenizer, model, text)

    # Copy into an ndarray and view it as a single row.
    return np.array(embedding).reshape(1, -1)
def is_plagiarism(similarity_score, plagiarism_threshold):
    """Tell whether a similarity score counts as plagiarism.

    Args:
        similarity_score: Similarity between the submitted document and its
            closest match.
        plagiarism_threshold: Score from which a document is flagged.

    Returns:
        True when the score is equal to or above the threshold, else False.
    """
    # A *high* similarity is what signals plagiarism, so the score has to
    # reach the threshold - not stay below it.
    return similarity_score >= plagiarism_threshold
def check_incoming_document(incoming_document):
    """Return the document in English, or None when it cannot be handled.

    English text is returned unchanged; text detected as German, French,
    Greek, Japanese or Russian is passed through ``translate_text``; any
    other detected language yields ``None``.
    """
    detected = detect(incoming_document)
    translatable = ['de', 'fr', 'el', 'ja', 'ru']

    # Already English: nothing to do.
    if detected == 'en':
        return incoming_document

    # Supported foreign language: translate in English.
    if detected in translatable:
        return translate_text(incoming_document, detected)

    # Anything else is unsupported.
    return None
def run_plagiarism_analysis(query_text, data, plagiarism_threshold=0.8):
    """Compare a submitted document with the vector database.

    Args:
        query_text: The submitted document, in any supported language.
        data: DataFrame with ``abstract`` and ``vectors`` columns. A
            ``similarity`` column is written to it as a side effect.
        plagiarism_threshold: Score from which the document is flagged.

    Returns:
        A dict with the keys ``similarity_score``, ``is_plagiarism``,
        ``most_similar_article`` and ``article_submitted``.

    Raises:
        ValueError: If the document's language is not supported, or if
            ``data`` has no rows to compare against.
    """
    top_N = 3

    # Check the language of the query/incoming text and translate if required.
    document_translation = check_incoming_document(query_text)
    if document_translation is None:
        # Raise instead of calling exit(): a helper must not shut down the
        # interpreter (or notebook kernel) of whoever calls it.
        raise ValueError("Only the following languages are supported: English, French, Russian, German, Greek and Japanese")

    if len(data) == 0:
        raise ValueError("The vector database is empty; there is nothing to compare against.")

    # Preprocess the document to get the required vector for similarity analysis
    query_vect = process_document(document_translation)

    # Run similarity Search
    # NOTE(review): cosine_similarity is not imported in the visible file;
    # presumably sklearn.metrics.pairwise.cosine_similarity - confirm.
    data["similarity"] = data["vectors"].apply(
        lambda x: cosine_similarity(query_vect, x)[0][0]
    )

    # Start at position 0: after the descending sort the first row IS the
    # best match, so it must not be skipped.
    similar_articles = data.sort_values(by='similarity', ascending=False)[:top_N]

    # "paper_id" is optional in the database; select only what is present.
    wanted = [
        column
        for column in ("abstract", "paper_id", "similarity")
        if column in similar_articles.columns
    ]
    formated_result = similar_articles[wanted].reset_index(drop=True)

    best_match = formated_result.iloc[0]
    similarity_score = best_match["similarity"]
    most_similar_article = best_match["abstract"]
    is_plagiarism_bool = is_plagiarism(similarity_score, plagiarism_threshold)

    plagiarism_decision = {
        'similarity_score': similarity_score,
        'is_plagiarism': is_plagiarism_bool,
        'most_similar_article': most_similar_article,
        'article_submitted': query_text,
    }
    return plagiarism_decision
系统实验
评估
# English experiment.
# NOTE(review): english_article_to_check is assigned here but none of the
# visible lines pass it to run_plagiarism_analysis; the run below uses an
# abstract taken from the sampled data instead.
english_article_to_check = "The need for multidisciplinary research to address today's complex health and environmental challenges has never been greater. The One Health (OH) approach to research ensures that human, animal, and environmental health questions are evaluated in an integrated and holistic manner to provide a more comprehensive understanding of the problem and potential solutions than would be possible with siloed approaches. However, the OH approach is complex, and there is limited guidance available for investigators regarding the practical design and implementation of OH research. In this paper we provide a framework to guide researchers through conceptualizing and planning an OH study. We discuss key steps in designing an OH study, including conceptualization of hypotheses and study aims, identification of collaborators for a multi-disciplinary research team, study design options, data sources and collection methods, and analytical methods. We illustrate these concepts through the presentation of a case study of health impacts associated with land application of biosolids. Finally, we discuss opportunities for applying an OH approach to identify solutions to current global health issues, and the need for cross-disciplinary funding sources to foster an OH approach to research."
# Select an existing article from the database
new_incoming_text = source_data.iloc[0]['abstract']
# Run the plagiarism detection
analysis_result = run_plagiarism_analysis(new_incoming_text, vector_database, plagiarism_threshold=0.8)
# French experiment: the text goes through language detection and
# translation inside run_plagiarism_analysis before it is compared.
french_article_to_check = """Les Réseaux d’Innovation et de Transfert Agricole (RITA) ont été créés en 2011 pour mieux connecter la recherche et le développement agricole, intra et inter-DOM, avec un objectif d’accompagnement de la diversification des productions locales. Le CGAAER a été chargé d'analyser ce dispositif et de proposer des pistes d'action pour améliorer la chaine Recherche – Formation – Innovation – Développement – Transfert dans les outre-mer dans un contexte d'agriculture durable, au profit de l'accroissement de l'autonomie alimentaire."""
analysis_result = run_plagiarism_analysis(french_article_to_check, vector_database, plagiarism_threshold=0.8)
# Bare expression: shows the result when run in a notebook cell.
analysis_result
# German experiment: same flow as above, with a German abstract.
german_article_to_check = """Derzeit ist eine Reihe strukturell und funktionell unterschiedlicher temperaturempfindlicher Elemente wie RNA-Thermometer bekannt, die eine Vielzahl biologischer Prozesse in Bakterien, einschließlich der Virulenz, steuern. Auf der Grundlage einer Computer- und thermodynamischen Analyse der vollständig sequenzierten Genome von 25 Salmonella enterica-Isolaten wurden ein Algorithmus und Kriterien für die Suche nach potenziellen RNA-Thermometern entwickelt. Er wird es ermöglichen, die Suche nach potentiellen Riboschaltern im Genom anderer gesellschaftlich wichtiger Krankheitserreger durchzuführen. Für S. enterica wurden neben dem bekannten 4U-RNA-Thermometer vier Hairpin-Loop-Strukturen identifiziert, die wahrscheinlich als weitere RNA-Thermometer fungieren. Sie erfüllen die notwendigen und hinreichenden Bedingungen für die Bildung von RNA-Thermometern und sind hochkonservative nichtkanonische Strukturen, da diese hochkonservativen Strukturen im Genom aller 25 Isolate von S. enterica gefunden wurden. Die Hairpins, die eine kreuzförmige Struktur in der supergewickelten pUC8-DNA bilden, wurden mit Hilfe der Rasterkraftmikroskopie sichtbar gemacht."""
analysis_result = run_plagiarism_analysis(german_article_to_check, vector_database, plagiarism_threshold=0.8)
# Bare expression: shows the result when run in a notebook cell.
analysis_result
结论
声明:部分内容来源于网络,仅供读者学习、交流之目的。文章版权归原作者所有。如有不妥,请联系删除。
评论
GPT的风也吹到了CV,详解自回归视觉模型的先驱! ImageGPT:使用图像序列训练图像 GPT模型
作者丨科技猛兽编辑丨极市平台导读 在 CIFAR-10 上,iGPT 使用 linear probing 实现了 96.3% 的精度,优于有监督的 Wide ResNet,并通过完全微调实现了 99.0% 的精度,匹配顶级监督预训练模型。本文目录1 自回归视觉模型的先驱 ImageGPT:
机器学习初学者
0
深入浅出各种边缘检测算子及其推导
作者丨Rustle@知乎(已授权)来源丨https://zhuanlan.zhihu.com/p/59640437编辑丨极市平台导读 本文系统的讲解了边缘检测算法的相关概念,并辅以大量的图与公式帮助大家深入理解各种边缘检测算子。 写在前面:本文篇幅较长,用了大量图与公式帮助大家深
机器学习初学者
0
面试官:在原生input上面使用v-model和组件上面使用有什么区别?
前言面试官:vue3的v-model都用过吧,来讲讲。粉丝:v-model其实就是一个语法糖,在编译时v-model会被编译成:modelValue属性和@update:modelValue事件。一般在子组件中定义一个名为modelValue的props来接收父组件v-model传递的值,然后当子组
高级前端进阶
0
使用 GitHub Actions 构建 Golang PGO
今年 2 月,我宣布 Dolt 版本现已构建为配置文件引导优化 (pgo) 二进制文件,利用 Golang 1.20 的强大功能将 Dolt 的读取性能提高 5.3%。在我宣布这一消息之前,我们的一位常驻 Golang 专家 Zach 试验并测试了 Golang 的 pgo 功能
GoCN
0
盘点一个使用超级鹰识别验证码并自动登录的案例
点击上方“Python共享之家”,进行关注回复“资源”即可获赠Python学习资料今日鸡汤江上几人在,天涯孤棹还。大家好,我是皮皮。一、前言前几天在Python钻石交流群【静惜】问了一个Python实现识别验证码并自动登录的问题,提问截图如下:验证码的截图如下所示:二、实现过程这里大家激烈的探讨,【
IT共享之家
0
Langchain使用 | 模型、提示和解析器、存储
零、LangChain介绍为各种不同基础模型提供统一接口- 帮助管理提示的框架- 一套中心化接口,用于处理长期记忆(参见Memory)、外部数据(参见Indexes)、其他 LLM(参见Chains)以及 LLM 无法处理的任务的其他代理(例如,计算或搜索)。总的来说,有六大核心模块:Models:
Python之王
0
Stability AI开放Stable Diffusion 3 API,在线免费使用
「Stability AI」宣布开放其最新文本到图像生成模型「Stable Diffusion 3」的API接口,供开发者和企业使用。该模型采用创新的多模态扩散转换器架构,在字体、细节还原、提示理解等方面表现优异,评测结果超越了业内其他顶尖系统。与DALL-E 3和Midjourney v6等最先进
IQ前端
0
图解 transformer 中的自注意力机制
↓推荐关注↓本文将将介绍注意力的概念从何而来,它是如何工作的以及它的简单的实现。注意力机制在整个注意力过程中,模型会学习了三个权重:查询、键和值。查询、键和值的思想来源于信息检索系统。所以我们先理解数据库查询的思想。假设有一个数据库,里面有所有一些作家和他们的书籍信息。现在我想读一些Rabindra
Python学习与数据挖掘
0