-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
96 lines (73 loc) · 3.81 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import streamlit as st
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
import nltk
from nltk.probability import FreqDist
import plotly.express as px
from collections import Counter
from nltk.corpus import stopwords
import string
# Fetch the NLTK data used further down: the Punkt tokenizer data backs
# nltk.word_tokenize, and 'stopwords' backs stopwords.words('english').
# Recent NLTK releases look the tokenizer data up under the name 'punkt_tab'
# instead of 'punkt', so both are requested to keep word_tokenize working
# regardless of the installed NLTK version.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)
# Pretrained checkpoint shared by the tokenizer and the language model below.
_CHECKPOINT = 'gpt2'

# Module-level GPT-2 tokenizer/model pair used by calculate_perplexity().
tokenizer = GPT2Tokenizer.from_pretrained(_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(_CHECKPOINT)

# Disabled alternative (XLNet):
# tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
# model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
def calculate_perplexity(text, *, shift_labels=False):
    """Return a GPT-2 based perplexity-style score for *text*.

    The text is tokenized with the module-level ``tokenizer``, run through
    the module-level ``model`` without gradients, and the exponential of the
    mean cross-entropy between the logits and the token ids is returned.

    Args:
        text: The string to score.
        shift_labels: When False (the default, matching the historical
            behaviour of this function) the logits at position ``i`` are
            compared with the token at position ``i``. When True, the logits
            at position ``i`` are compared with the token at position
            ``i + 1``, which is the usual next-token perplexity of a causal
            language model.

    Returns:
        The score as a Python float.

    Raises:
        ValueError: If *text* produces no tokens, or if ``shift_labels`` is
            True and *text* produces fewer than two tokens.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    if not token_ids:
        raise ValueError("calculate_perplexity() requires non-empty text")

    # GPT-2 only has position embeddings for `n_positions` tokens; longer
    # inputs would fail inside the model, so score the leading window only.
    max_positions = getattr(model.config, "n_positions", None)
    if max_positions is not None:
        token_ids = token_ids[:max_positions]

    if shift_labels and len(token_ids) < 2:
        raise ValueError("shift_labels=True requires at least two tokens")

    input_ids = torch.tensor(token_ids, dtype=torch.long)
    with torch.no_grad():
        logits = model(input_ids).logits

    # Flatten to (tokens, vocab) / (tokens,) so the same code works whether
    # or not the model adds a leading batch dimension.
    logits = logits.view(-1, logits.size(-1))
    targets = input_ids.view(-1)

    if shift_labels:
        # Align each prediction with the token that actually follows it.
        logits, targets = logits[:-1], targets[1:]
    # NOTE(review): the unshifted default is kept because the caller's
    # hard-coded threshold is compared against that value; switching the
    # default to the shifted form would require re-tuning that threshold.

    loss = torch.nn.functional.cross_entropy(logits, targets)
    return torch.exp(loss).item()
def calculate_burstiness(text):
    """Return the share of repeated word types relative to the token count.

    The text is lower-cased and split with ``nltk.word_tokenize``. The score
    is the number of distinct tokens that occur more than once, divided by
    the total number of tokens.

    Args:
        text: The string to analyse.

    Returns:
        A float in ``[0, 1)``; ``0.0`` when *text* yields no tokens (empty
        or whitespace-only input), instead of raising ZeroDivisionError.
    """
    tokens = nltk.word_tokenize(text.lower())
    if not tokens:
        # Nothing to measure; avoid dividing by zero below.
        return 0.0

    word_freq = FreqDist(tokens)
    repeated_count = sum(count > 1 for count in word_freq.values())
    return repeated_count / len(tokens)
def plot_top_repeated_words(text):
    """Draw a bar chart of the ten most frequent non-stopword words in *text*.

    The text is split on whitespace, each piece is lower-cased and stripped
    of leading/trailing punctuation, English stopwords and empty pieces are
    dropped, and the ten most common remaining words are rendered with
    Plotly inside the current Streamlit container.

    Args:
        text: The string whose words are counted.

    Returns:
        None. The chart is emitted through ``st.plotly_chart``.
    """
    stop_words = set(stopwords.words('english'))

    # Strip surrounding punctuation so that "word," and "word" are counted
    # together and "The," is still recognised as a stopword. Pieces made up
    # only of punctuation become empty strings and are discarded.
    cleaned = (piece.lower().strip(string.punctuation) for piece in text.split())
    tokens = [word for word in cleaned if word and word not in stop_words]

    # Ten most frequent words, most common first.
    top_words = Counter(tokens).most_common(10)
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]

    fig = px.bar(x=words, y=counts, labels={'x': 'Words', 'y': 'Counts'}, title='Top 10 Most Repeated Words')
    st.plotly_chart(fig, use_container_width=True)
# ---- Streamlit page: input box, Analyze button, three result columns ----
st.set_page_config(layout="wide")
st.title("GPT Shield: AI Plagiarism Detector")
text_area = st.text_area("Enter text", "")

# Hard-coded decision cut-offs used for the verdict below.
# NOTE(review): how these values were chosen is not visible in this file.
PERPLEXITY_THRESHOLD = 7524.77197265625
BURSTINESS_THRESHOLD = 0.2

if st.button("Analyze"):
    if not text_area.strip():
        # The text box yields an empty string by default; the metrics below
        # cannot be computed on empty input, so ask for text instead.
        st.warning("Please enter some text to analyze.")
    else:
        col1, col2, col3 = st.columns([1, 1, 1])

        with col1:
            # Echo the submitted text back to the user.
            st.info("Your Input Text")
            st.success(text_area)

        with col2:
            st.info("Detection Score")
            perplexity = calculate_perplexity(text_area)
            burstiness_score = calculate_burstiness(text_area)
            # The earlier call to is_generated_by_ai() was removed: that name
            # is not defined or imported anywhere in this file (NameError),
            # and its result was not used in the decision.
            st.write("Perplexity:", perplexity)
            st.write("Burstiness Score:", burstiness_score)

            if perplexity > PERPLEXITY_THRESHOLD and burstiness_score < BURSTINESS_THRESHOLD:
                st.error("Text Analysis Result: AI generated content")
            else:
                st.success("Text Analysis Result: Likely not generated by AI")

            st.warning("Disclaimer: AI plagiarism detector apps can assist in identifying potential instances of plagiarism; however, it is important to note that their results may not be entirely flawless or completely reliable. These tools employ advanced algorithms, but they can still produce false positives or false negatives. Therefore, it is recommended to use AI plagiarism detectors as a supplementary tool alongside human judgment and manual verification for accurate and comprehensive plagiarism detection.")

        with col3:
            # Word-frequency chart for the submitted text.
            st.info("Basic Details")
            plot_top_repeated_words(text_area)