-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathperplexity.py
59 lines (43 loc) · 2.41 KB
/
perplexity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import streamlit as st
from transformers import XLNetTokenizer, XLNetLMHeadModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import nltk
from nltk.probability import FreqDist
from collections import Counter
from nltk.corpus import stopwords
import string
import numpy as np
# GPT-2 is the language model used for perplexity scoring. The XLNet classes
# imported above are not used by the code below.
@st.cache_resource
def _load_gpt2():
    """Load the pretrained GPT-2 tokenizer and LM-head model.

    Streamlit re-executes the whole script on every rerun; caching the pair
    as a resource keeps the weights from being reloaded each time.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return GPT2Tokenizer.from_pretrained('gpt2'), GPT2LMHeadModel.from_pretrained('gpt2')


tokenizer, model = _load_gpt2()
def calculate_perplexity(text):
    """Return the perplexity of ``text`` under the module-level language model.

    Perplexity is ``exp`` of the mean next-token cross-entropy: the logits
    produced at position ``i`` are scored against the token at position
    ``i + 1``.

    Args:
        text: The string to score.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If ``text`` encodes to fewer than two tokens, in which
            case there is no next-token prediction to score.
    """
    encoded_input = tokenizer.encode(text, add_special_tokens=False, return_tensors='pt')
    input_ids = encoded_input[0]
    if input_ids.numel() < 2:
        raise ValueError("text must encode to at least two tokens to compute perplexity")

    # NOTE(review): inputs longer than the model's context window are not
    # truncated or windowed here -- confirm callers only pass short texts.
    with torch.no_grad():
        # Pass an explicit batch dimension so logits are (1, seq_len, vocab).
        logits = model(input_ids.unsqueeze(0)).logits

    # A causal LM predicts the *next* token, so drop the last position's
    # logits and the first token's label to align predictions with targets.
    shifted_logits = logits[0, :-1, :]
    shifted_labels = input_ids[1:]
    mean_nll = torch.nn.functional.cross_entropy(shifted_logits, shifted_labels)
    return torch.exp(mean_nll).item()
def generate_sample_data():
    """Return the built-in sample corpus.

    Returns:
        A ``(human_text, ai_text)`` tuple; each element is a one-item list
        holding a human-written and an AI-written paragraph respectively.
    """
    human_written = "we are a group of 5 engineering students, we aim to create an ai generated text detection project, that accurately marks the difference between human and GPT-generated content. This will help the audience know the origin of content, thus increase the transparency between the users."
    ai_written = "As a team of five engineering students, we want to develop an artificial intelligence project for text identification that can distinguish between content produced by GPT and that created by humans. This will improve user transparency by assisting the audience in understanding the source of the content."
    return [human_written], [ai_written]
def calculate_threshold(human_text, ai_text):
    """Derive a perplexity threshold from human-written samples.

    The threshold is the mean perplexity of the human samples plus one
    (population) standard deviation of those perplexities.

    Args:
        human_text: Iterable of human-written strings to score.
        ai_text: Iterable of AI-written strings. Accepted so existing callers
            keep working; it does not enter the threshold formula, so it is
            not scored.

    Returns:
        The threshold as a NumPy float.

    Raises:
        ValueError: If ``human_text`` yields no samples (the mean would
            otherwise be NaN).
    """
    human_perplexities = [calculate_perplexity(text) for text in human_text]
    if not human_perplexities:
        raise ValueError("human_text must contain at least one sample")
    return np.mean(human_perplexities) + np.std(human_perplexities)
# Script body: build the sample corpus, derive the perplexity threshold from
# it, and render the result on the Streamlit page.
human_text, ai_text = generate_sample_data()
threshold = calculate_threshold(human_text=human_text, ai_text=ai_text)
st.write("Perplexity Threshold:", threshold)