% -*- latex -*-
\documentclass{seminar}
\usepackage{lsalike}
\usepackage{longtable}
\usepackage{cmbright}
\usepackage{path}
\usepackage{skt}
\usepackage[all,ps,dvips]{xy}
\SelectTips{cm}{}
\CompileMatrices
\special{papersize=11in, 8.5in}
\newenvironment{indentLeft}
  {\begin{list}{}{\rightmargin 0pt\leftmargin 0.5in}\item[]}
  {\end{list}}
\newenvironment{indentLeftLess}
  {\begin{list}{}{\rightmargin 0pt\leftmargin 0.275in}\item[]}
  {\end{list}}
\def\pt{\phantom`}
\def\ps{\phantom*}
\def\hx{\hskip1ex\relax}
\def\gap{\vrule height0.4pt depth0pt width0.4cm}
\newcommand{\ip}[2]{#1 <\!\!< #2}

\slideframe{none}
\begin{document}
\begin{slide}\thispagestyle{empty}
\begin{center}
\begin{large}Improving the Efficiency of Parsing\\
Discontinuous Constituents\end{large}\\[12pt]
Mike Daniels\\
6 December 2002
\end{center}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Talk Structure\end{large}\end{center}
\begin{itemize}
\item Background and Motivation
\item Definition of Task
\item Solution and Outlook
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Discontinuous Constituents\end{large}\end{center}
\begin{itemize}
\item Basic idea: the categories dominated by a nonterminal need not
  dominate adjacent strings of terminals.
\item Linked to concepts like tangled trees, linearization, wrapping,
  precedence, order domains, (relatively) free word order, etc.
\item Two examples of alleged discontinuous constituents in English:
\begin{enumerate}
\item He \textbf{picked} his daughter \textbf{up} from school yesterday.
\item He is \textbf{an easy} man \textbf{to please}.
\end{enumerate}
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Motivation\end{large}\end{center}
\begin{itemize}
\item There has been a growing acceptance of discontinuous
  constituents in recent years.
\item In particular, one can point to the ongoing development of HPSG
  linearization grammar (e.g. Reape 1993, Kathol 2000, M\"uller
  1999)\nocite{reape:93,kathol:00,smueller:99}.
\item Similarly, one sees increasing use of discontinuous constituents
  throughout syntactic frameworks and treebanks (NEGRA
  \cite{skut:ea:97}, VERBMOBIL \cite{hinrichs-et-al-treebanks-vm00}).
\item One corpus study \cite{kruijff02} found that 1\% of all English
  constituents were discontinuous; for German, Dutch, and Czech, the
  percentages were 11\%, 18\%, and 58\%.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Motivation (Cont.)\end{large}\end{center}
\begin{itemize}
\item As it becomes more natural for linguists to use discontinuous
  constituents to analyze natural language, it becomes more important
  for parsers to be able to handle them efficiently.
\item This efficient handling is currently lacking. Most current
  approaches for parsing HPSG linearization grammars use some variant
  of the generate-and-test paradigm: all constraints on word order are
  removed from the parser and relegated to a constraint solver.
\item The goal of this research was therefore to develop a parser that
  could work directly with grammars using discontinuous constituents
  and effciently parse according to those grammars.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}
  So what does it actually mean to have a grammar that uses
  discontinuous constituents?
\end{center}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Grammar Format\end{large}\end{center}
\begin{flushleft}
  Based on the Linear Specification Language
  \cite{goetz:penn:97,suhre:99}.\\
  We have \textbf{lexical entries} of the form $A \to t$ and
  \textbf{grammar rules} of the form $A \to \alpha; L$, where:\\
\begin{itemize}
\item $\alpha$ is a \textbf{list of non-terminals}.
\item $L$ is a set of \textbf{linearization constraints}:
\begin{itemize}
\item $A < B$ \textbf{(precedence)}: The terminals dominated by $A$
  all occur to the left of the terminals dominated by $B$.
\item $\ip{A}{B}$ \textbf{(immediate precedence)}: The rightmost
  terminal dominated by $B$ occurs immediately to the left of the
  leftmost terminal dominated by $B$.
\item $[A]$ \textbf{(isolation)}: $A$ dominates an uninterrupted
  sequence of terminals in the input.
\end{itemize}
\pagebreak
\item In a normal context-free grammar, the order of the right-hand
  side categories is used to encode linear order.
\item But even though our format is no longer strongly tied to linear
  order, the right-hand side of a grammar rule is still stated as
  a list.
\item The reason: the ordering is used to guide parsing; it need not
  reflect surface order.
\pagebreak
\item For example, the rule\\[6pt]
  s $\to$ verb$_1$ nom$_2$ acc$_3$; $\{$2 $<$ 1, 3 $<$ 1$\}$\\[6pt]
  requires that the verb be parsed first, even though it must follow
  the other two constituents.
\item In part, this format can be seen as relaxing the right-hand
  side of each rule from representing a total order on the categories
  listed to merely a partial order.
\end{itemize}
\end{flushleft}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Sample Grammar\end{large}\end{center}
\begin{tabular}{l|l|l}
1&\multicolumn{2}{l}{s $\to$ verb$_1$ nom$_2$ acc$_3$; $\{2 < 1, 3 <
  1\}$}\\
2&\multicolumn{2}{l}{s $\to$ verb$_1$ nom$_2$; $\{2 < 1\}$}\\
3&\multicolumn{2}{l}{s $\to$ conj$_1$ s$_2$ s$_3$; $\{\ip{2}{1},
  \ip{1}{3}, [2], [3]\}$}\\
4&\multicolumn{2}{l}{acc $\to$ adj$_1$ acc$_2$; $\{~\}$}\\
5&\textit{nalas} $\to$ nom & `Nala' (a proper name)\\
6&\textit{nagaram} $\to$ acc & `city'\\
7&\textit{agacchat} $\to$ verb & `went'\\
8&\textit{ca} $\to$ conj & `and'\\
9&\textit{avadat} $\to$ verb & `spoke'\\
10&\textit{ruciram} $\to$ adj & `shining'\\
\end{tabular}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Example\end{large}\end{center}
\begin{flushleft}
(1) \begin{tabular}[t]{*7{l@{\hskip6pt}}}
\multicolumn{7}{l}{\skt .r8+:Y4a;.ca:=\ZH{-6}{M} na;l+.ea
  na;ga:=+ma;ga;.cC+.c1a na;l+.ea Y;va;d;t,a}\\
ruciram & nalas & nagaram & agacchat & ca & nalas & avadat\\
shining & nala & city & went & and & nala & spoke\\
\multicolumn{7}{l}{`Nala went to the shining city and Nala spoke.'}\\
\end{tabular}
\end{flushleft}
\begin{center}
\begin{small}
\begin{tabular}{c}
\xymatrix@C-2.2pc@R-1.6pc{
&&&&&&&&\ar@{-}[ddllllll]\ar@{-}[dddrrr]\ar@{-}[ddddd]\txt{s}\\
\\
&&\ar@{-}[dll]\ar@{-}[ddd]\ar@{-}[dddrrrr]\ar@{-->}@/^1ex/[dddrrrrrr]\txt{[s]}\\
\ar@{-}[dd]\ar@{-}[ddrrrr]\ar@{-->}@/^1ex/[ddrrrrrr]\txt{acc} &&&&&&&&&&&%
\ar@{-}[ddl]\ar@{-}[ddr]\txt{[s]}\\
\\
\ar@{-}[d]\txt{adj} && \ar@{-}[d]\ar@{-->}@/^2ex/[rrrr]\txt{nom} &&
\ar@{-}[d]\txt{acc} && \ar@{-}[d]\txt{verb} &&
\ar@{-}[d]\ar@{-->}@/^1ex/[uurrr]\txt{conj} &&
\ar@{-}[d]\ar@{-->}@/^1ex/[rr]\txt{nom} && \ar@{-}[d]\txt{verb}\\
{\txt{ruciram}} && {\txt{nalas}} && {\txt{nagaram}} &&
{\txt{agacchat}} && {\txt{ca}} && {\txt{nalas}} && {\txt{avadat}} \\
}
\end{tabular}
\end{small}\\
\begin{tiny}
(Solid lines represent dominance, while dashed arrows represent the
precedence relations enforced by the grammar.)
\end{tiny}
\end{center}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Parser Overview\end{large}\end{center}
\begin{itemize}
\item Representational issues
\item The parsing algorithm itself
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Parsing Framework\end{large}\end{center}
\begin{itemize}
\item Based on Earley's Algorithm: a top-down, goal-directed approach.
\item Also a bottom-up approach: driven by lexical information.
\item Modified to start with lexical seeding and to use an active
  chart.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Edge Coverage: Earley\end{large}\end{center}
\begin{itemize}
\item An active chart contains edges encoding in-process as
  well as completed parses of a (sub)string.
\item Traditionally, edge coverage is modelled by an interval.
\item For instance, $\langle$[0,2], S, [VP]$\rangle$ might represent
  the edge for\\\textit{The dog} in
  \textit{$_0$\kern-2pt The$_1$dog$_2$sleeps$_3$}.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Edge Coverage: GIDLP\end{large}\end{center}
\begin{itemize}
\item GIDLP grammars allow for discontinuity, and so a single interval
  is no longer sufficient: a constituent may dominate any subset of
  the input.
\item Following Johnson (1985)\nocite{johnson:85parsing} and Reape
  (1991)\nocite{reape:91parsing}, we represent edge coverage with
  bitvectors.
\item Each active bit represents the inclusion of the word at that
  position; the left edge of the input corresponds to the left edge of
  the vector.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Encoding Bitvectors\end{large}\end{center}
\begin{itemize}
\item We want to represent the coverage of the discontinuous
  accusative noun phrase \textit{ruciram nagaram} in the example given
  earlier:\\
\begin{tabular}[t]{*7{c@{\hskip6pt}}}
\textbf{ruciram} & nalas & \textbf{nagaram} & agacchat & ca & nalas
& avadat\\
yes & no & yes & no & no & no & no\\
\end{tabular}
\item The following are equivalent in terms of descriptive power:
\begin{itemize}
\item Interval lists (Johnson 1985): \path|[[0,1],[2,3]]|
\item Bit lists (Reape 1991): \path|[1,0,1,0,0,0,0]|
\item Binary numbers: 0000101$_2$ (= 5$_{10}$)\\
\begin{small}
(Notice that the least significant bit of the number corresponds to the
left-most word in the input.)
\end{small}
\end{itemize}
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Computing With Bitvectors\end{large}\end{center}
\begin{itemize}
\item Parsing requires a number of bitwise operations: overlap,
  lbound, suffix, and so on. (See Appendix A in the handout for a full
  list.)
\item The nature of current hardware allows bitwise operations to be
  computed in parallel in a single processor instruction.
\item Thus it takes just as long to, say, compute the bitwise-AND of
  two 20-bit numbers as it does for two 30-bit numbers.
\item In general, for a processor that has a word size of $n$,
  arithmetic computation time is proportional to the base-$n$
  logarithm of the inputs. Operations on any list-based representation
  will take time proportional to the length of the list itself.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Edge Constraints\end{large}\end{center}
\begin{itemize}
\item We also use bitvectors to encode \textbf{constraints} on
  coverage; bitvectors used this way are called bitmasks.
\item We store a negative and positive bitmask on each edge,
  representing different aspects of precedence information.\\
  Assume we've found a noun as the third word of a sentence:
\begin{itemize}
\item Negative: If we know that verbs follow nouns, we shouldn't look
  for verbs in the first or second position.
\item Positive: If we know that verbs immediately follow nouns, then
  any verb we find must cover at least the fourth position.
\end{itemize}
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Prediction\end{large}\end{center}
\begin{itemize}
\item Given an edge with a non-empty RHS, consider the first element
  of that RHS and see if any rules have that category as their LHS.
\begin{indentLeft}
  \textit{Consider the edge}\\[6pt]
  $s : s_2\, s_3; \{\ip{s_2}{0010000}, \ip{0010000}{s_3}, [s_2],
  [s_3]\}$,\\
  Coverage = 0010000, NMask = 0010000\\[6pt]
  \textit{which represents a point in the parse where the conjunction
  of rule 3 has been found as the fifth word of the sentence.}\\[6pt]
  \textit{One qualifying rule in the grammar is number 3 again:}\\[6pt]
  $s \to c_1\, s_2\, s_3;\{\ip{2}{1}, \ip{1}{3}\}$.\\
\end{indentLeft}
\pagebreak
\item Compute the masks for each such rule by considering each order
  constraint in turn:
\begin{itemize}
\item For precedence constraints, have we already found one of the
  appropriate categories? If so, apply a left- or right-mask, as
  appropriate.
\begin{indentLeftLess}
  \textit{Here, we're predicting $s_2$, which is mentioned in the
    constraint $\ip{s_2}{0010000}$. Since the predicted element needs
    to precede a known location, we augment the negative mask with
    suffix(lbound(0010000)) = 1110000.}
\end{indentLeftLess}
\pagebreak
\item For immediate precedence, also augment the positive mask with
  the bit adjacent to the appropriate bound of the position.
\begin{indentLeftLess}
  \textit{With the constraint $\ip{s_2}{0010000}$, we use the bit
    adjacent to the left bound of 0010000: we augment the positive
    mask with 0001000.}
\end{indentLeftLess}
\end{itemize}
\item Make sure the negative mask has enough ``holes'' for the RHS of
  the current rule to fit into.
\begin{indentLeft}
  \textit{The negative mask 1110000 does in fact have enough space for
    the three elements of the rule we're predicting.}
\end{indentLeft}
\item Enter the resulting edge into the chart.
\begin{indentLeft}
  \textit{We end up with} $s : c\, s_2\, s_3; \{\ip{s_2}{c},
  \ip{c}{s_3}\},$\\
  Coverage = 0000000, NMask = 1110000,\\ PMask = 0100000\\
\end{indentLeft}
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Completion\end{large}\end{center}
\begin{itemize}
\item Given an active and a passive edge:
\begin{indentLeft}
  \textit{Fast-forward to the point where we've recognized that the
  second, third, and fourth words together form a sentence:}\\
  \textit{Active:} $s : s_2 s_3; \{\ip{s_2}{0010000}, \ip{0010000}{s_3},
  [s_2], [s_3]\},$\\
  Coverage = 0010000, NMask = 0010000\\[6pt]
  \textit{Passive:}\\ $s : ~; \{~\}$, Coverage = 0001110\\
\end{indentLeft}
\begin{itemize}
\item Check that the active edge's negative mask does not overlap with
  the passive edge's coverage.
\begin{indentLeftLess}
  \textit{0010000 does not overlap with 0001110.}\\
\end{indentLeftLess}
\pagebreak
\item If we're about to create a passive edge: check that the passive
  edge's coverage is bitwise-implied by the active edge's positive
  mask.
\begin{indentLeftLess}
  \textit{Nothing to check here, since we'll still need to find $s_3$.}\\
\end{indentLeftLess}
\item If the active edge contains an isolation constraint, make sure
  the terminals it would govern are contiguous.
\begin{indentLeftLess}
\textit{0001110 is indeed contiguous.}\\
\end{indentLeftLess}
\pagebreak
\item Compute the new coverage and update the LP constraints.
\begin{indentLeftLess}
  \textit{Combining 0001110 and 0010000 yields 0011110. The LP
    constraints $\ip{s_2}{0010000}$ and $[s_2]$ are discarded, as
    they no longer refer to unfound elements.}\\
\end{indentLeftLess}
\item Add the resulting edge to the chart.
\begin{indentLeftLess}
  $s : s_3; \{\ip{0010000}{s_3}, [s_3]\},$\\
  Coverage = 0010000, NMask = 0010000\\
\end{indentLeftLess}
\end{itemize}
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Summary\end{large}\end{center}
\begin{itemize}
\item Grammar format generalizes ID/LP grammars to allow for
  discontinuous constituents.
\item Parser generalizes Earley's algorithm to allow for discontinuous
  constituents; also allows the processing order of the RHS to be
  specified, thereby generalizing head-driven parsing.
\item Coverage constraints encoded as bitvectors provide an efficient,
  compiled representation of linear precedence information.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Advanced Issues\end{large}\end{center}
\begin{itemize}
\item Global precedence and isolation statements.
\item Chart indexing strategies that avoid the enumeration of all
  bitvectors of length $n$.
\end{itemize}
\end{slide}

\begin{slide}
\begin{center}\begin{large}Future Work\end{large}\end{center}
\begin{itemize}
\item Move to complex categories (first compound terms, then feature
  structures).
\item Large-scale grammar development and evaluation: automatic
  extraction from the NEGRA corpus, and recoding the grammar encoded
  in the Babel program \cite{smueller:96babel}.
\end{itemize}
\end{slide}

%\begin{slide}
%\bibliographystyle{mwd2}
%\bibliography{hpsg}
%\end{slide}
\end{document}
