Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions scoring/src/scoring/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,19 @@ def rater_factor_key(i):
gaussianNoteInterceptNoHighVolKey = "gaussianNoteInterceptNoHighVol"
gaussianNoteInterceptNoCorrelatedKey = "gaussianNoteInterceptNoCorrelated"
gaussianNoteInterceptPopulationSampledKey = "gaussianNoteInterceptPopulationSampled"
# Gaussian Core With Topics Model
# Note-level output column names for GaussianCoreWithTopicsScorer.  Each key
# parallels a "gaussian*" key above, with the prefix "gaussianCoreWithTopics",
# and is registered with a dtype in noteModelOutputTSVColumnsAndTypes.
gaussianCoreWithTopicsNoteInterceptKey = "gaussianCoreWithTopicsNoteIntercept"
gaussianCoreWithTopicsNoteFactor1Key = "gaussianCoreWithTopicsNoteFactor1"
gaussianCoreWithTopicsRatingStatusKey = "gaussianCoreWithTopicsRatingStatus"
gaussianCoreWithTopicsActiveRulesKey = "gaussianCoreWithTopicsActiveRules"
gaussianCoreWithTopicsNumFinalRoundRatingsKey = "gaussianCoreWithTopicsNumFinalRoundRatings"
gaussianCoreWithTopicsNoteInterceptNoHighVolKey = "gaussianCoreWithTopicsNoteInterceptNoHighVol"
gaussianCoreWithTopicsNoteInterceptNoCorrelatedKey = (
"gaussianCoreWithTopicsNoteInterceptNoCorrelated"
)
gaussianCoreWithTopicsNoteInterceptPopulationSampledKey = (
"gaussianCoreWithTopicsNoteInterceptPopulationSampled"
)
# Harassment/Abuse Tag
harassmentNoteInterceptKey = "harassmentNoteIntercept"
harassmentNoteFactor1Key = "harassmentNoteFactor1"
Expand Down Expand Up @@ -394,6 +407,9 @@ def rater_factor_key(i):
aboveHelpfulnessThresholdKey = "aboveHelpfulnessThreshold"
totalHelpfulHarassmentRatingsPenaltyKey = "totalHelpfulHarassmentPenalty"
raterAgreeRatioWithHarassmentAbusePenaltyKey = "raterAgreeRatioKeyWithHarassmentAbusePenalty"
# Rater-level columns; emitted with the nullable pd.Int64Dtype() in
# raterModelOutputTSVColumnsAndTypes.
# NOTE(review): presumably 14-day totals of CRH / CRNH / NMR outcomes -- confirm
# the exact definition against the code that populates them.
crhTotal14dKey = "crhTotal14d"
crnhTotal14dKey = "crnhTotal14d"
nmrTotal14dKey = "nmrTotal14d"

# Note Status Labels
currentlyRatedHelpful = "CURRENTLY_RATED_HELPFUL"
Expand Down Expand Up @@ -960,6 +976,14 @@ def rater_factor_key(i):
(gaussianNoteInterceptNoHighVolKey, np.double),
(gaussianNoteInterceptPopulationSampledKey, np.double),
(gaussianNumFinalRoundRatingsKey, np.double), # double because nullable.
(gaussianCoreWithTopicsNoteInterceptKey, np.double),
(gaussianCoreWithTopicsNoteFactor1Key, np.double),
(gaussianCoreWithTopicsRatingStatusKey, "category"),
(gaussianCoreWithTopicsActiveRulesKey, "category"),
(gaussianCoreWithTopicsNoteInterceptNoHighVolKey, np.double),
(gaussianCoreWithTopicsNoteInterceptNoCorrelatedKey, np.double),
(gaussianCoreWithTopicsNoteInterceptPopulationSampledKey, np.double),
(gaussianCoreWithTopicsNumFinalRoundRatingsKey, np.double), # double because nullable.
]
# Column names, in declaration order, taken from the (name, dtype) pairs.
noteModelOutputTSVColumns = [name for name, _ in noteModelOutputTSVColumnsAndTypes]
# Lookup from column name to its declared dtype, built from the same pairs.
noteModelOutputTSVTypeMapping = dict(noteModelOutputTSVColumnsAndTypes)
Expand Down Expand Up @@ -1049,6 +1073,9 @@ def rater_factor_key(i):
(coreWithTopicsRaterFactor1Key, np.double),
(coreFirstRoundRaterInterceptKey, np.double),
(coreFirstRoundRaterFactor1Key, np.double),
(crhTotal14dKey, pd.Int64Dtype()),
(crnhTotal14dKey, pd.Int64Dtype()),
(nmrTotal14dKey, pd.Int64Dtype()),
]
# Column names, in declaration order, taken from the (name, dtype) pairs.
raterModelOutputTSVColumns = [name for name, _ in raterModelOutputTSVColumnsAndTypes]
# Lookup from column name to its declared dtype, built from the same pairs.
raterModelOutputTSVTypeMapping = dict(raterModelOutputTSVColumnsAndTypes)
Expand Down
2 changes: 2 additions & 0 deletions scoring/src/scoring/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class Scorers(Enum):
MFTopicScorer = auto()
MFMultiGroupScorer = auto()
GaussianScorer = auto()
GaussianCoreWithTopicsScorer = auto()


class Topics(Enum):
Expand All @@ -26,6 +27,7 @@ class Topics(Enum):
GazaConflict = 2
MessiRonaldo = 3
Scams = 4
InDimensionTwo = 5


def scorers_from_csv(csv: str) -> Set[Scorers]:
Expand Down
83 changes: 83 additions & 0 deletions scoring/src/scoring/gaussian_core_with_topics_scorer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from typing import Dict, List, Optional

from . import constants as c
from .gaussian_scorer import GaussianScorer


class GaussianCoreWithTopicsScorer(GaussianScorer):
  """Gaussian convolution scorer for the core population, with topics retained.

  Scoring logic is inherited unchanged from GaussianScorer.  This subclass only
  pins the constructor configuration (rater groups from c.coverageGroups,
  unassigned raters included, topics not excluded) and maps the generic internal
  note columns onto the gaussianCoreWithTopics* output columns.

  NOTE(review): the population is meant to mirror MFCoreWithTopicsScorer --
  confirm that c.coverageGroups is the group set that scorer uses.
  """

  def __init__(
    self,
    seed: Optional[int] = None,
    threads: int = c.defaultNumThreads,
    saveIntermediateState: bool = False,
  ) -> None:
    """Build the scorer with its fixed group/topic configuration.

    Args:
      seed: if not None, seed value to ensure deterministic execution
      threads: number of threads to use for intra-op parallelism in pytorch
      saveIntermediateState: if True, save intermediate state for debugging
    """
    super().__init__(
      seed=seed,
      threads=threads,
      saveIntermediateState=saveIntermediateState,
      includedGroups=c.coverageGroups,
      includeUnassigned=True,
      excludeTopics=False,
      captureThreshold=0.5,
    )

  def get_name(self):
    """Returns the identifier of this scorer."""
    return "GaussianCoreWithTopicsScorer"

  def _get_note_col_mapping(self) -> Dict[str, str]:
    """Returns the internal-name -> output-name renames for note columns of this model."""
    renamePairs = (
      (c.internalNoteInterceptKey, c.gaussianCoreWithTopicsNoteInterceptKey),
      (c.internalNoteFactor1Key, c.gaussianCoreWithTopicsNoteFactor1Key),
      (c.internalActiveRulesKey, c.gaussianCoreWithTopicsActiveRulesKey),
      (c.numFinalRoundRatingsKey, c.gaussianCoreWithTopicsNumFinalRoundRatingsKey),
      (c.internalNoteInterceptNoHighVolKey, c.gaussianCoreWithTopicsNoteInterceptNoHighVolKey),
      (
        c.internalNoteInterceptNoCorrelatedKey,
        c.gaussianCoreWithTopicsNoteInterceptNoCorrelatedKey,
      ),
      (
        c.internalNoteInterceptPopulationSampledKey,
        c.gaussianCoreWithTopicsNoteInterceptPopulationSampledKey,
      ),
      (c.lowDiligenceNoteInterceptKey, c.lowDiligenceLegacyNoteInterceptKey),
      (c.internalRatingStatusKey, c.gaussianCoreWithTopicsRatingStatusKey),
    )
    return dict(renamePairs)

  def _get_user_col_mapping(self) -> Dict[str, str]:
    """Returns the renames for user columns of this model; there are none."""
    return dict()

  def get_scored_notes_cols(self) -> List[str]:
    """Returns the columns expected in the scoredNotes output: note id, then model columns."""
    modelCols = [
      c.gaussianCoreWithTopicsNoteInterceptKey,
      c.gaussianCoreWithTopicsNoteFactor1Key,
      c.gaussianCoreWithTopicsRatingStatusKey,
      c.gaussianCoreWithTopicsActiveRulesKey,
      c.gaussianCoreWithTopicsNumFinalRoundRatingsKey,
      c.gaussianCoreWithTopicsNoteInterceptNoHighVolKey,
      c.gaussianCoreWithTopicsNoteInterceptNoCorrelatedKey,
      c.gaussianCoreWithTopicsNoteInterceptPopulationSampledKey,
    ]
    return [c.noteIdKey, *modelCols]

  def get_helpfulness_scores_cols(self) -> List[str]:
    """Returns the columns expected in the helpfulnessScores output (rater id only)."""
    return [c.raterParticipantIdKey]

  def get_auxiliary_note_info_cols(self) -> List[str]:
    """Returns the columns expected in the auxiliaryNoteInfo output (note id only)."""
    return [c.noteIdKey]
66 changes: 36 additions & 30 deletions scoring/src/scoring/gaussian_scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ def __init__(
self._crhParams = crhParams
self._crnhParams = crnhParams
self._useMfNoteParams = useMfNoteParams
self._centeredBins = False

def get_prescoring_name(self) -> str:
  """Returns the fixed prescoring scorer name, "MFCoreScorer".

  NOTE(review): presumably selects which prescoring output this scorer reads --
  confirm against the callers of get_prescoring_name.
  """
  return "MFCoreScorer"
Expand Down Expand Up @@ -367,7 +368,7 @@ def _get_dropped_note_cols(self) -> List[str]:

def _get_dropped_user_cols(self) -> List[str]:
"""Returns a list of columns which should be excluded from helpfulnessScores output."""
return []
return [c.internalRaterFactor1Key]

def _prepare_data_for_scoring(self, ratings: pd.DataFrame, final: bool = False) -> pd.DataFrame:
"""Prepare data for scoring. This includes filtering out notes and raters which do not meet
Expand Down Expand Up @@ -397,7 +398,7 @@ def _return_all_pts(
params = self._crhParams if isCrh else self._crnhParams

numQuantiles = len(quantileRange)
quantileCols = [f"{x:5.2f}" for x in quantileRange]
quantileCols = [f"{x:5.3f}" for x in quantileRange]
quantileArray = np.array(quantileRange, dtype=np.float32)

assert (
Expand Down Expand Up @@ -523,21 +524,20 @@ def _return_all_pts(
quantileCols
].values

if not isCrh:
# Ensure notes with fewer than 3 ratings on each side get 0.1 smoothing
signCounts = (
ratingsForTrainingWithFactors.assign(
neg=ratingsForTrainingWithFactors[c.internalRaterFactor1Key] < 0,
pos=ratingsForTrainingWithFactors[c.internalRaterFactor1Key] > 0,
)
.groupby(c.noteIdKey)[["neg", "pos"]]
.sum()
.astype(int)
# Ensure notes with fewer than 3 ratings on each side get 0.1 smoothing
signCounts = (
ratingsForTrainingWithFactors.assign(
neg=ratingsForTrainingWithFactors[c.internalRaterFactor1Key] < 0,
pos=ratingsForTrainingWithFactors[c.internalRaterFactor1Key] > 0,
)
insufficientMask = (signCounts["neg"] < 3) | (signCounts["pos"] < 3)
insufficientNoteIds = signCounts[insufficientMask].index
isInsufficient = np.isin(uniqueNotes, insufficientNoteIds)
smoothingValues[isInsufficient] = 0.1
.groupby(c.noteIdKey)[["neg", "pos"]]
.sum()
.astype(int)
)
insufficientMask = (signCounts["neg"] < 3) | (signCounts["pos"] < 3)
insufficientNoteIds = signCounts[insufficientMask].index
isInsufficient = np.isin(uniqueNotes, insufficientNoteIds)
smoothingValues[isInsufficient] = 0.1

# Smoothing weights
if params.adaptiveWeightBase is not None:
Expand Down Expand Up @@ -589,7 +589,7 @@ def _gaussian_kernel_extrapolator_vectorized(
ratingsForTrainingWithFactors, quantileRange, isCrh=isCrh, empiricalPriors=empiricalPriors
)

quantileCols = [f"{x:5.2f}" for x in quantileRange]
quantileCols = [f"{x:5.3f}" for x in quantileRange]

# Compute intercept
logValues = np.log(clippedValues[quantileCols].values)
Expand Down Expand Up @@ -765,31 +765,36 @@ def _score_notes_and_users(
].nunique()
> self._nBinsEachSide
):
_, l_range = pd.qcut(
l_range = (
ratersWithParams.loc[ratersWithParams[c.internalRaterFactor1Key] < 0][
c.internalRaterFactor1Key
],
self._nBinsEachSide,
retbins=True,
]
.quantile(list(np.linspace(0.001, 0.999, self._nBinsEachSide)))
.values
)
_, r_range = pd.qcut(
r_range = (
ratersWithParams.loc[ratersWithParams[c.internalRaterFactor1Key] > 0][
c.internalRaterFactor1Key
],
self._nBinsEachSide,
retbins=True,
]
.quantile(list(np.linspace(0.001, 0.999, self._nBinsEachSide)))
.values
)
lMids = (l_range[:-1] + l_range[1:]) / 2
rMids = (r_range[:-1] + r_range[1:]) / 2
mids = (np.array(sorted(abs(lMids))) + np.array(sorted(abs(rMids)))) / 2
crhQuantileRange = np.concatenate([sorted(-mids), mids])
crnhQuantileRange = np.concatenate([sorted(-mids), mids])
if self._centeredBins:
mids = (np.array(sorted(abs(lMids))) + np.array(sorted(abs(rMids)))) / 2
crhQuantileRange = np.concatenate([sorted(-mids), mids])
crnhQuantileRange = np.concatenate([sorted(-mids), mids])
else:
crhQuantileRange = np.concatenate([lMids, rMids])
crnhQuantileRange = np.concatenate([lMids, rMids])
logger.info(f"crh quantile range: {crhQuantileRange}")
logger.info(f"crnh quantile range: {crnhQuantileRange}")
# if there are not enough unique raters to even calculate bins, do not predict
else:
scoredNotes = pd.DataFrame(columns=self.get_internal_scored_notes_cols())
helpfulnessScores = pd.DataFrame(columns=self.get_internal_helpfulness_scores_cols())
return pd.DataFrame(columns=self.get_internal_scored_notes_cols()), pd.DataFrame(
columns=self.get_internal_helpfulness_scores_cols()
)

else:
crhQuantileRange = c.quantileRange
Expand Down Expand Up @@ -957,6 +962,7 @@ def _score_notes_and_users(
helpfulnessScores = prescoringRaterModelOutput[
[
c.raterParticipantIdKey,
c.internalRaterFactor1Key,
]
]

Expand Down
Loading