# -*- coding: utf-8 -*-
import os
import argparse
import random
import spacy
import re
import codecs
from random import randint
from nltk.corpus import wordnet as wn
# Module-level spaCy pipeline shared by the code below.
# NOTE(review): the second call rebinds `nlp`, so the pipeline loaded on the
# first line is discarded. The 'en' shortcut name is reportedly unavailable in
# spaCy v3+ -- confirm which model is intended and keep only that load.
nlp = spacy.load('en_core_web_sm')
nlp = spacy.load('en')
from random import randint
import constants
def readlines(fname):
    """Return the non-blank lines of ``fname`` with surrounding whitespace removed.

    The file is decoded as UTF-8; undecodable bytes are ignored rather than
    raising. Lines that are empty after stripping are skipped.
    """
    with open(fname, encoding="utf-8", errors="ignore") as handle:
        stripped = (raw.strip() for raw in handle)
        return [entry for entry in stripped if entry]
class PrepHandler:
    """Generate caption variants by swapping prepositions.

    On construction the captions are read from the files ``old.en`` and
    ``new.en`` (relative to the current working directory).
    """

    # Token texts that, when found right after a noun chunk, tie it to the
    # following chunk. spaCy token texts carry no trailing whitespace, so the
    # comma is listed as ',' (a ', ' entry could never match a token).
    _CHUNK_CONNECTORS = frozenset({',', 'with', 'and', 'or'})

    def __init__(self):
        # Read each file once; both access styles below expose the same data.
        coco_lines = readlines('old.en')
        real_lines = readlines('new.en')
        # Option 1: both caption sets in a single dict.
        self.captions = {
            "coco": coco_lines,
            "real": real_lines,
        }
        # Option 2: separate attributes. Copies keep the two views independent
        # so mutating one list does not silently change the other.
        self.captions_coco = list(coco_lines)
        self.captions_real = list(real_lines)

    def match(self, orig_text, pattern_text):
        """Return True if ``pattern_text`` starts with ``orig_text``.

        Elements are compared position by position. When ``pattern_text`` is
        shorter than ``orig_text`` it cannot contain it as a prefix, so the
        result is False (instead of an IndexError).
        """
        if len(pattern_text) < len(orig_text):
            return False
        return all(a == b for a, b in zip(orig_text, pattern_text))

    def is_ascii_word(self, word):
        """Return True if ``word`` contains only 'a'-'z' and spaces.

        Uppercase letters, digits and punctuation make the result False.
        An empty string yields True.
        """
        return all(ch == ' ' or 'a' <= ch <= 'z' for ch in word)

    def DetectionOfNounChunks(self, text_info):
        """Collect the noun chunks of a parsed text and relate linked chunks.

        Args:
            text_info: object exposing ``noun_chunks`` (items with integer
                ``start``/``end`` attributes) and integer indexing that
                returns items with a ``text`` attribute -- presumably a
                spaCy ``Doc``; confirm against callers.

        Returns:
            ``(chunks, related)`` where ``chunks`` is the list of noun chunks
            and ``related[j][k]`` is True when chunks ``j`` and ``k`` are
            linked, directly or through a chain of linked chunks.
        """
        chunks = list(text_info.noun_chunks)
        count = len(chunks)
        related = [[False] * count for _ in range(count)]

        for j, left in enumerate(chunks):
            for k in range(j + 1, count):
                right = chunks[k]
                # More than one token lies between the chunks: not linked.
                if left.end + 1 < right.start:
                    continue
                # Linked when directly adjacent, or when the token right
                # after the left chunk is a connector (comma/with/and/or).
                if (left.end == right.start
                        or text_info[left.end].text in self._CHUNK_CONNECTORS):
                    related[j][k] = True
                    related[k][j] = True

        # Transitive closure (Warshall): j~p and p~k implies j~k.
        for p in range(count):
            for j in range(count):
                if not related[j][p]:
                    continue
                for k in range(count):
                    if related[p][k]:
                        related[j][k] = True
        return chunks, related

    def _build_prep_substitutes(self):
        """Map each preposition to the sorted words of every group not containing it."""
        substitutes = {}
        for subset in constants.prep_set:
            for word in subset:
                if word in substitutes:
                    continue
                candidates = set()
                for other in constants.prep_set:
                    if word not in other:
                        candidates.update(other)
                # Sorted so that iteration order (and therefore the output
                # order and the seeded sample) is reproducible across runs.
                substitutes[word] = sorted(candidates)
        return substitutes

    def generate_prep_adversaries(self, captions, seed=1):
        """Create caption variants with one preposition replaced.

        Each caption is lower-cased and run through the module-level ``nlp``
        pipeline. For every token tagged ``ADP`` that appears in
        ``constants.prep_set``, one variant is produced per substitute word
        taken from the groups that do not contain that token. Tokens of a
        variant are joined with single spaces. Noun-chunk relations are not
        consulted when choosing substitutions.

        Args:
            captions: iterable of caption strings. ``None`` falls back to
                ``self.captions_real``.
            seed: seed for the sampler that picks the example variant printed
                for each caption.

        Returns:
            List of all generated variants, in caption order. Empty when no
            caption yields a substitution (including empty input).
        """
        if captions is None:
            captions = self.captions_real
        substitutes = self._build_prep_substitutes()
        # Local generator: honours ``seed`` without touching global state.
        rng = random.Random(seed)

        output = []
        for p, caption in enumerate(captions):
            print("p:", p)
            text = nlp(caption.lower())
            words = [token.text for token in text]
            variants = []
            for q, token in enumerate(text):
                if token.pos_ != 'ADP' or token.text not in substitutes:
                    continue
                for replacement in substitutes[token.text]:
                    swapped = words[:q] + [replacement] + words[q + 1:]
                    variants.append(' '.join(swapped))
            if variants:
                print(rng.choice(variants))
            output.extend(variants)
        return output