Ich versuche, das Notizbuch auf Google Multilingual Universal Encoder zu bearbeiten. Allerdings mit meinen eigenen Daten.
#@title Setup common imports and functions
import bokeh
import bokeh.models
import bokeh.plotting
import numpy as np
import os
import pandas as pd
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer
import sklearn.metrics.pairwise
from simpleneighbors import SimpleNeighbors
from tqdm import tqdm
from tqdm import trange
import json
def visualize_similarity(embeddings_1, embeddings_2, labels_1, labels_2,
plot_title,
plot_width=1200, plot_height=600,
xaxis_font_size='12pt', yaxis_font_size='12pt'):
print("embeddings_1: " + str(len(embeddings_1)) + ", labels_1: " + str(len(labels_1)))
assert len(embeddings_1) == len(labels_1)
assert len(embeddings_2) == len(labels_2)
# arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
sim = 1 - np.arccos(
sklearn.metrics.pairwise.cosine_similarity(embeddings_1,
embeddings_2))/np.pi
embeddings_1_col, embeddings_2_col, sim_col = [], [], []
for i in range(len(embeddings_1)):
for j in range(len(embeddings_2)):
embeddings_1_col.append(labels_1[i])
embeddings_2_col.append(labels_2[j])
sim_col.append(sim[i][j])
df = pd.DataFrame(zip(embeddings_1_col, embeddings_2_col, sim_col),
columns=['embeddings_1', 'embeddings_2', 'sim'])
print(df)
def main():
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')
df_rni = pd.read_csv('df.csv')
# The 16-language multilingual module is the default but feel free
# to pick others from the list and compare the results.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3' #@param ['https://tfhub.dev/google/universal-sentence-encoder-multilingual/3', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3']
model = hub.load(module_url)
def embed_text(input):
return model(input)
def compute_similarity(references, target):
# I want to create as many rows as there are references and fill them with the results
# arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
for row in target.iterrows():
for reference in references:
sim = 1 - np.arccos(
result = sklearn.metrics.pairwise.cosine_similarity(row,
reference))/np.pi
# place the result in the column "reference"
# get unique job categories and job of people
job_categories = X.S02Q11_Professional_field.unique()
actual_jobs = df_rni.new_professionactuelle.unique()
# turn them to list
job_categories = job_categories.tolist()
actual_jobs = actual_jobs.tolist()
# transform remaining ' to "
job_categories = json.dumps(job_categories)
actual_jobs =json.dumps(actual_jobs)
# create tensors ?
references_result = embed_text(job_categories)
target_result = embed_text(actual_jobs)
# visualize similarity
visualize_similarity(references_result, target_result, job_categories, actual_jobs, "Multilingual Universal Sentence Encoder for Semantic Retrieval (Yang et al., 2019)")
if __name__ == "__main__":
main()
Aber die der Größe Einbettung ist nich die gleiche wie die Größe des label. Dann fällt es mir wieder ein:
(seg_env4) antoi@LAPTOP-UTL8OHHO:/mnt/c/Users/antoi/Documents/Programming/Covent Garden/Segmentation$ python job_similarities.py
2020-02-21 16:58:16.473628: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
2020-02-21 16:58:16.475202: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
2020-02-21 16:58:16.475473: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2020-02-21 16:58:18.563861: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2020-02-21 16:58:18.564174: E tensorflow/stream_executor/cuda/cuda_driver.cc:351] failed call to cuInit: UNKNOWN ERROR (303)
2020-02-21 16:58:18.565574: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (LAPTOP-UTL8OHHO): /proc/driver/nvidia/version does not exist
2020-02-21 16:58:18.566227: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-02-21 16:58:18.583608: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2208000000 Hz
2020-02-21 16:58:18.587137: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fffc567b630 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-02-21 16:58:18.587422: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2020-02-21 16:58:33.700358: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 1301365760 exceeds 10% of system memory.
2020-02-21 16:58:34.009643: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 780819456 exceeds 10% of system memory.
2020-02-21 16:58:42.952592: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 1301365760 exceeds 10% of system memory.
2020-02-21 16:58:44.177061: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 780819456 exceeds 10% of system memory.
2020-02-21 16:58:47.592505: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 1301365760 exceeds 10% of system memory.
embeddings_1: 1, labels_1: 678
Traceback (most recent call last):
File "job_similarities.py", line 89, in <module>
main()
File "job_similarities.py", line 86, in main
visualize_similarity(references_result, target_result, job_categories, actual_jobs, "Multilingual Universal Sentence Encoder for Semantic Retrieval (Yang et al., 2019)")
File "job_similarities.py", line 26, in visualize_similarity
assert len(embeddings_1) == len(labels_1)
AssertionError
In der Tat, in job_categories, die Etiketten, sind:
["-99", "Property and construction", "Trade", "Leisure, sport and tourism", "Accountancy, banking and finance", "Creative arts and design", "Charity and voluntary work", "Sales", "Healthcare", "Engineering and manufacturing", "Social care", "Agriculture, farming and environment", "Law", "Other. Please specify:", "Teacher training and education", "Hospitality and events management", "Transport and logistics", "Energy and utilities", "Public services and administration", "Information technology", "Business, consulting and management", "Recruitment and HR", "Law enforcement and security", "Marketing, advertising and PR", "Media and internet", "Science and pharmaceuticals"]
Aber die Einbettung ist eine Form (1,512), wahrend ich mehr als eine Berufskategorie habe:
tf.Tensor(
[[ 2.32259948e-02 1.22201964e-02 1.83513425e-02 -5.18600419e-02
...
-1.68162826e-02 -3.61434743e-02 7.69515261e-02 -4.70911013e-03
-9.94433276e-03 -6.31517246e-02 4.47040834e-02 7.09775463e-02
5.70349321e-02 -3.14863063e-02 -1.15674343e-02 4.80637699e-02]], shape=(1, 512), dtype=float32)
Das Beispiel-Array im Google-Notizbuch ist jedoch:
["Willkommen zu einfachen, aber", "verrassend krachtige", "multilingüe", "compréhension du langage naturel", "модели.", "大家是什么意思" , "보다 중요한", ".اللغة التي يتحدثونها"]
Gibt einen Tensor mit einer Form zurück (8.512)